From c969f18fba1a073757271e5da104e4f8b83f332e Mon Sep 17 00:00:00 2001
From: David OK <david.ok8@gmail.com>
Date: Thu, 11 Jan 2024 21:49:22 +0000
Subject: [PATCH 01/49] ENH: show the compiled statement by Halide.

---
 doc/book/random/separable_conv_2d.stmt.html | 19874 ++++++++++++++++++
 doc/book/random/vector_intrinsics.Rmd       |    10 +
 2 files changed, 19884 insertions(+)
 create mode 100644 doc/book/random/separable_conv_2d.stmt.html
diff --git a/doc/book/random/separable_conv_2d.stmt.html b/doc/book/random/separable_conv_2d.stmt.html
new file mode 100644
index 000000000..3c011333f
--- /dev/null
+++ b/doc/book/random/separable_conv_2d.stmt.html
@@ -0,0 +1,19874 @@
+<html>
+<head>
+<title>Visualizing Module: conv_y</title>
+<!-- Bootstrap links: stylesheet, JS bundle and icon font, all fetched from the jsDelivr CDN -->
+<link href='https://cdn.jsdelivr.net/npm/bootstrap@5.2.0/dist/css/bootstrap.min.css' rel='stylesheet' integrity='sha384-gH2yIJqKdNHPEq0n4Mqa/HGKIhSkIHeL5AyhkYV8i59U5AR6csBvApHHNl/vI1Bx' crossorigin='anonymous'>
+
+<script src='https://cdn.jsdelivr.net/npm/bootstrap@5.2.0/dist/js/bootstrap.bundle.min.js' integrity='sha384-A3rJD856KowSb7dwlZdYEkO39Gagi7vIsF0jrRAoQmDKKtQBHUuLZ9AsSv4jD4Xa' crossorigin='anonymous'></script>
+
+<link rel='stylesheet' href='https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/css/bootstrap.min.css'>
+<link rel='stylesheet' href='https://cdn.jsdelivr.net/npm/bootstrap-icons@1.5.0/font/bootstrap-icons.css'>
+
+<!-- Tooltip links: Floating UI core and DOM packages -->
+<script src='https://cdn.jsdelivr.net/npm/@floating-ui/core@1.0.1'></script>
+<script src='https://cdn.jsdelivr.net/npm/@floating-ui/dom@1.0.1'></script>
+
+<!-- Hierarchy links: Treeflex stylesheet -->
+<link rel='stylesheet' href='https://unpkg.com/treeflex/dist/css/treeflex.css'>
+
+<!-- Expand Button links: Font Awesome and jQuery, requested over HTTPS so they are not blocked as mixed content on an HTTPS-hosted page -->
+<link href='https://maxcdn.bootstrapcdn.com/font-awesome/4.1.0/css/font-awesome.min.css' rel='stylesheet'>
+
+<script src='https://code.jquery.com/jquery-1.10.2.js'></script>
+
+<!-- Assembly Code links: CodeMirror stylesheet, core script and add-ons (link is a void element, so it takes no end tag) -->
+<link rel='stylesheet' href='https://cdnjs.cloudflare.com/ajax/libs/codemirror/5.52.2/codemirror.min.css'>
+<script src='https://cdnjs.cloudflare.com/ajax/libs/codemirror/5.52.2/codemirror.min.js'></script>
+<script src='https://cdnjs.cloudflare.com/ajax/libs/codemirror/6.65.7/mode/gas/gas.min.js'></script>
+<script src='https://cdnjs.cloudflare.com/ajax/libs/codemirror/6.65.7/addon/selection/mark-selection.min.js'></script>
+<script src='https://cdnjs.cloudflare.com/ajax/libs/codemirror/6.65.7/addon/search/searchcursor.min.js'></script>
+<script src='https://cdnjs.cloudflare.com/ajax/libs/codemirror/6.65.7/addon/search/search.min.js'></script><style type='text/css'>
+    /* General CSS Rules*/
+    body {
+        font-family: Consolas, 'Liberation Mono', Menlo, Courier, monospace;
+        font-size: 12px;
+        background: #f8f8f8;
+        margin-left: 15px;
+    }
+
+    div#page-container {
+        height: 100vh;
+        display: flex;
+        flex-direction: column;
+    }
+
+    a,
+    a:hover,
+    a:visited,
+    a:active {
+        color: inherit;
+        text-decoration: none;
+    }
+
+    b {
+        font-weight: normal;
+    }
+
+    table {
+        font-size: 12px;
+    }
+
+    /* Visualization tabs */
+    div#visualization-tabs {
+        display: flex;
+        flex-grow: 1;
+        width: 100%;
+        overflow: hidden;
+        border-top: 1px solid rgb(200, 200, 200)
+    }
+
+    div#ir-code-tab {
+        counter-reset: line;
+        padding-left: 50px;
+        padding-top: 20px;
+        overflow-y: scroll;
+        position: relative;
+    }
+
+    div#ir-visualization-tab {
+        overflow-y: scroll;
+        padding-top: 20px;
+        padding-left: 20px;
+        position: relative;
+    }
+
+    /* Resize bars */
+    /* The bar itself: light green strip with black side borders and a column-resize cursor. */
+    div#visualization-tabs div.resize-bar {
+        background: rgb(201, 231, 190);
+        cursor: col-resize;
+        border-left: 1px solid rgb(0, 0, 0);
+        border-right: 1px solid rgb(0, 0, 0);
+    }
+
+        /* Collapse-button container inside the bar: one rule holds the
+           relative 50% top offset together with the zero margin. */
+        div#visualization-tabs div.resize-bar div.collapse-btns {
+            position: relative;
+            top: 50%;
+            margin: 0px;
+        }
+
+    button.resize-btn {
+        margin: 10px 3px;
+        font-size: 18px;
+    }
+
+    /* IR Code Section CSS */
+    b.Highlight {
+        font-weight: bold;
+        background-color: #DDD;
+    }
+
+    span.Highlight {
+        font-weight: bold;
+        background-color: #FF0;
+    }
+
+    span.OpF32 {
+        color: hsl(106deg 100% 40%);
+        font-weight: bold;
+    }
+
+    span.OpF64 {
+        color: hsl(106deg 100% 30%);
+        font-weight: bold;
+    }
+
+    span.OpB8 {
+        color: hsl(208deg 100% 80%);
+        font-weight: bold;
+    }
+
+    span.OpB16 {
+        color: hsl(208deg 100% 70%);
+        font-weight: bold;
+    }
+
+    span.OpB32 {
+        color: hsl(208deg 100% 60%);
+        font-weight: bold;
+    }
+
+    span.OpB64 {
+        color: hsl(208deg 100% 50%);
+        font-weight: bold;
+    }
+
+    span.OpI8 {
+        color: hsl(46deg 100% 45%);
+        font-weight: bold;
+    }
+
+    span.OpI16 {
+        color: hsl(46deg 100% 40%);
+        font-weight: bold;
+    }
+
+    span.OpI32 {
+        color: hsl(46deg 100% 34%);
+        font-weight: bold;
+    }
+
+    span.OpI64 {
+        color: hsl(46deg 100% 27%);
+        font-weight: bold;
+    }
+
+    span.OpVec2 {
+        background-color: hsl(100deg 100% 90%);
+        font-weight: bold;
+    }
+
+    span.OpVec4 {
+        background-color: hsl(100deg 100% 80%);
+        font-weight: bold;
+    }
+
+    span.Memory {
+        color: #d22;
+        font-weight: bold;
+    }
+
+    span.Pred {
+        background-color: #ffe8bd;
+        font-weight: bold;
+    }
+
+    span.Label {
+        background-color: #bde4ff;
+        font-weight: bold;
+    }
+
+    div.show-hide-btn-wrapper {
+        position: relative;
+        width: 0;
+        height: 0;
+    }
+
+    div.show-hide-btn {
+        position: absolute;
+        left: -14px;
+        top: 4px;
+        font-size: 8pt;
+    }
+
+        div.show-hide-btn:hover {
+            color: #c30000;
+        }
+
+    button.icon-btn {
+        border: 0px;
+        background: transparent;
+        color: black;
+        font-size: 11pt;
+        display: inline-block;
+        vertical-align: middle;
+        margin-right: 5px;
+        margin-left: 5px;
+        padding: 0px;
+    }
+
+    div#ir-visualization-tab button.icon-btn {
+        margin-left: 1px;
+    }
+
+    button.icon-btn:hover {
+        color: #c30000;
+    }
+
+    code.ptx {
+        tab-size: 26;
+        white-space: pre;
+    }
+
+    div.indent {
+        padding-left: 15px;
+    }
+
+    span.comment {
+        color: #998;
+        font-style: italic;
+    }
+
+    span.keyword {
+        color: #333;
+        font-weight: bold;
+    }
+
+    span.IntImm {
+        color: #099;
+    }
+
+    span.UIntImm {
+        color: #099;
+    }
+
+    span.FloatImm {
+        color: #099;
+    }
+
+    span.StringImm {
+        color: #d14;
+    }
+
+    span.Type {
+        color: #445588;
+        font-weight: bold;
+    }
+
+    span.Symbol {
+        color: #990073;
+    }
+
+    span.Assign {
+        color: #d14;
+        font-weight: bold;
+    }
+
+    p.WrapLine {
+        margin: 0px;
+        margin-left: 30px;
+        text-indent: -30px;
+    }
+
+    div.WrapLine {
+        margin-left: 30px;
+        text-indent: -30px;
+    }
+
+    p.WrapLine,
+    span.IfSpan,
+    span.ClosingBrace,
+    div.Module,
+    div.WrapLine,
+    div.Consumer,
+    div.Produce,
+    div.For,
+    div.Evaluate,
+    div.Allocate,
+    div.Function {
+        counter-increment: line;
+    }
+    /* Line numbers: every element listed above bumps the `line` counter, and the same set of elements prints it through a :before pseudo-element, absolutely positioned at left: 0px, in gray, and excluded from text selection. */
+        p.WrapLine:before,
+        span.IfSpan:before,
+        span.ClosingBrace:before,
+        div.WrapLine:before,
+        div.Consumer:before,
+        div.Produce:before,
+        div.For:before,
+        div.Evaluate:before,
+        div.Allocate:before,
+        div.Module:before,
+        div.Function:before {
+            content: counter(line) '. ';
+            display: inline-block;
+            position: absolute;
+            left: 0px;
+            color: rgb(175, 175, 175);
+            user-select: none;
+            -webkit-user-select: none;
+        }
+    /* WrapLine numbers override left to 30px; NOTE(review): presumably this compensates for the -30px text-indent set on .WrapLine elements; confirm by rendering. */
+        p.WrapLine:before,
+        div.WrapLine:before {
+            left: 30px;
+        }
+
+    .collapsed-block {
+        position: absolute;
+        left: -9999px;
+        max-height: 0px;
+        overflow: hidden;
+        opacity: 0;
+    }
+
+    /* IR Viz Section CSS */
+    .collapsed-viz {
+        display: none;
+    }
+
+    div.fn-wrapper {
+        background-color: #f0f0f0;
+        border: 1px dashed grey;
+        margin-bottom: 15px;
+        width: max-content;
+    }
+
+    div.fn-header {
+        display: flex;
+        font-weight: bold;
+        padding: 5px 7px;
+        background-color: #dfdede;
+    }
+
+    span.fn-title {
+        margin-right: 10px;
+        margin-left: 3px;
+    }
+
+    div.fn-body {
+        padding: 7px;
+    }
+
+    div.fn-call {
+        padding: 3px 5px;
+        background: #f1daeb;
+        border: 1px dashed gray;
+    }
+
+    div.box {
+        border: 1px dashed grey;
+        width: max-content;
+        min-width: -webkit-fill-available;
+        display: flex;
+        flex-direction: column;
+    }
+
+        div.box:not(:last-child) {
+            margin-bottom: 15px;
+        }
+
+    div.box-header {
+        padding: 5px 7px;
+        display: flex;
+    }
+
+    div.box-title {
+        font-weight: bold;
+        margin-top: auto;
+        margin-bottom: auto;
+        margin-left: 3px;
+    }
+
+    div.viz-cost-btns {
+        margin: auto 0px auto auto;
+        display: flex;
+        padding-left: 10px;
+    }
+
+        div.viz-cost-btns div {
+            width: 10px;
+            height: 17.02px;
+            border: 1px solid gray;
+        }
+
+            div.viz-cost-btns div:hover {
+                border: 1px solid black;
+            }
+
+    div.box-body {
+        padding: 7px;
+    }
+
+    div.ProducerBox {
+        background-color: #e9fbe4;
+    }
+
+        div.ProducerBox > div.box-header {
+            background-color: #c6edbb;
+        }
+
+    div.ConsumerBox {
+        background-color: #ffe6e8;
+    }
+
+        div.ConsumerBox > div.box-header {
+            background-color: #f1c8cc;
+        }
+
+    div.ForBox {
+        background-color: #f0e9f9;
+    }
+
+        div.ForBox > div.box-header {
+            background-color: #ddc6fb;
+        }
+
+    div.IfBox {
+        background-color: #e6eeff;
+    }
+
+        div.IfBox > div.box-header {
+            background-color: #c0d4ff;
+        }
+
+    div.AllocateBox {
+        background-color: #f4f8bf;
+    }
+
+        div.AllocateBox > div.box-header {
+            background-color: #f5e790;
+        }
+
+    div.LoadBox {
+        background-color: #fff4e6;
+    }
+
+        div.LoadBox > div.box-header {
+            background-color: #fbe3c6;
+        }
+
+    div.StoreBox {
+        background-color: #ddf3e9;
+    }
+
+        div.StoreBox > div.box-header {
+            background-color: #bcedd6;
+        }
+
+    .if-root-node {
+        width: 200px !important;
+        border: 1px dashed gray !important;
+        text-align: center;
+        padding: 3px !important;
+        background: #c0d4ff;
+    }
+
+    .allocate-table {
+        background-color: rgba(197, 195, 195, 0.2);
+        min-width: -webkit-fill-available;
+    }
+
+        .allocate-table:not(:last-child) {
+            margin-bottom: 10px;
+        }
+
+        .allocate-table td, .allocate-table th {
+            padding: 5px;
+            border: 1px dashed gray;
+        }
+
+    button.trunc-cond {
+        font-size: 12px;
+        vertical-align: middle;
+        padding: 0px;
+        line-height: 1.5;
+        background: none;
+        border: none;
+    }
+
+    span.tooltip {
+        display: none;
+        position: absolute;
+        top: 0;
+        left: 0;
+        width: max-content;
+        padding: 8px;
+        background: #fff7e0;
+        font-size: 12px;
+        border-radius: 5px;
+        border: 1px solid #aaa;
+        z-index: 9999;
+        box-shadow: rgba(100, 100, 100, 0.8) 0 2px 5px 0;
+        text-indent: 0px;
+    }
+
+    .conditionTooltip {
+        width: 300px;
+        padding: 5px;
+        font-family: Consolas, 'Liberation Mono', Menlo, Courier, monospace;
+    }
+
+    /* TreeFlex CSS */
+    .tf-tree {
+        overflow: unset;
+    }
+
+        .tf-tree ul {
+            width: -webkit-fill-available;
+        }
+
+    .tf-custom-ir-viz li {
+        padding: 0px !important;
+        flex-grow: 1;
+    }
+
+    .tf-tree.tf-gap-sm li > .tf-nc:after, .tf-tree.tf-gap-sm li > .tf-node-content:after {
+        /*height: 1em;*/
+    }
+
+    .tf-custom-ir-viz {
+        font-size: 12px;
+    }
+
+        .tf-custom-ir-viz:not(:last-child) {
+            margin-bottom: 15px;
+        }
+
+        .tf-custom-ir-viz .tf-nc {
+            border: none;
+            margin: 0px;
+            padding: 0px;
+            width: -webkit-fill-available;
+        }
+
+            .tf-custom-ir-viz .tf-nc:before,
+            .tf-custom-ir-viz .tf-nc:after {
+                border-left-width: 1px;
+            }
+
+        .tf-custom-ir-viz li:not(:first-child) > .if-node {
+            padding-left: 8px !important;
+        }
+
+        .tf-custom-ir-viz li li:before {
+            border-top-width: 1px;
+        }
+
+        .tf-custom-ir-viz .end-node {
+            border-style: dashed;
+        }
+
+    /* CodeMirror */
+    .CodeMirror {
+        height: 100%;
+        width: 100%;
+    }
+
+    /* Cost model */
+
+    div.node-cost {
+        position: absolute;
+        left: 35px;
+        display: flex;
+    }
+
+    div.cost-btn {
+        width: 10px;
+        height: 14px;
+        margin-top: 2px;
+    }
+
+        /* 3px gap to the right of every cost button except the last one. */
+        div.cost-btn:not(:last-child) {
+            margin-right: 3px;
+        }
+
+        div.cost-btn:hover {
+            cursor: pointer;
+            border: 1px solid lightgray;
+        }
+
+    .CostColor19 {
+        background-color: rgb(130, 31, 27);
+    }
+
+    .CostColor18 {
+        background-color: rgb(145, 33, 30);
+    }
+
+    .CostColor17 {
+        background-color: rgb(160, 33, 32);
+    }
+
+    .CostColor16 {
+        background-color: rgb(176, 34, 34);
+    }
+
+    .CostColor15 {
+        background-color: rgb(185, 47, 32);
+    }
+
+    .CostColor14 {
+        background-color: rgb(193, 59, 30);
+    }
+
+    .CostColor13 {
+        background-color: rgb(202, 71, 27);
+    }
+
+    .CostColor12 {
+        background-color: rgb(210, 82, 22);
+    }
+
+    .CostColor11 {
+        background-color: rgb(218, 93, 16);
+    }
+
+    .CostColor10 {
+        background-color: rgb(226, 104, 6);
+    }
+
+    .CostColor9 {
+        background-color: rgb(229, 118, 9);
+    }
+
+    .CostColor8 {
+        background-color: rgb(230, 132, 15);
+    }
+
+    .CostColor7 {
+        background-color: rgb(231, 146, 20);
+    }
+
+    .CostColor6 {
+        background-color: rgb(232, 159, 25);
+    }
+
+    .CostColor5 {
+        background-color: rgb(233, 172, 30);
+    }
+
+    .CostColor4 {
+        background-color: rgb(233, 185, 35);
+    }
+
+    .CostColor3 {
+        background-color: rgb(233, 198, 40);
+    }
+
+    .CostColor2 {
+        background-color: rgb(232, 211, 45);
+    }
+
+    .CostColor1 {
+        background-color: rgb(231, 223, 50);
+    }
+
+    .CostColor0 {
+        background-color: rgb(236,233,89);
+    }
+</style></head>
+<body>
+  <div id='page-container'>
+<div id='visualization-tabs'>
+<div id='ir-code-tab'>
+<div class='Module' id='0-1'><a onclick='return toggle(0);' href=_blank><div class='show-hide-btn-wrapper'>  <div class='show-hide-btn' style='display:none;' id=0-show>    <i class='bi bi-plus-square' title='Expand code block'></i>  </div>  <div class='show-hide-btn' id=0-hide>    <i class='bi bi-dash-square' title='Collapse code block'></i>  </div></div><span class='matched' id='2-3'><span class='keyword' id='4-5'>module</span> name=conv_y, target=x86-64-linux-avx-avx2-f16c-fma-sse41</span></a><span class='matched' id='2-7'> {</span><div class='indent ModuleBody' id='0'><div class='Function' id='9-11'><a onclick='return toggle(10);' href=_blank><div class='show-hide-btn-wrapper'>  <div class='show-hide-btn' style='display:none;' id=10-show>    <i class='bi bi-plus-square' title='Expand code block'></i>  </div>  <div class='show-hide-btn' id=10-hide>    <i class='bi bi-dash-square' title='Collapse code block'></i>  </div></div><span class='matched' id='12-13'><span class='keyword nav-anchor' id='lowered-func-conv_y_par_for_conv_y_s0_x_xo_tile'>func </span>conv_y_par_for_conv_y_s0_x_xo_tile(</span><b class='variable matched' id='16-17'>__user_context</b><span class='matched' id='12-18'>,</span> <b class='variable matched' id='20-21'>conv_y.s0.x.xo.tile</b><span class='matched' id='12-22'>,</span> <b class='variable matched' id='24-25'>closure_arg</b><span class='matched' id='12-26'>)</span></a><button class='icon-btn sync-btn' onclick='scrollToViz("lowered-func-viz-10")'>  <i class='bi bi-arrow-right-square' title='Jump to visualization'></i></button><span class='matched' id='12-28'>{</span><div class='indent FunctionBody' id='10'><div class='LetStmt' id='30-33'><div class='node-cost' id='34-35'><div id='cc-31' class='cost-btn CostColor1'   aria-describedby='tooltip-cc-31'   line-cost='1' block-cost='70'   line-cost-color='1' block-cost-color='19'></div><span id='tooltip-cc-31' class='tooltip cond-tooltop' role='tooltip-cc-31'>Op Count: 1</span><div id='dc-31' 
class='cost-btn CostColor0'   aria-describedby='tooltip-dc-31'   line-cost='0' block-cost='1472'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-dc-31' class='tooltip cond-tooltop' role='tooltip-dc-31'>Bits Moved: 0</span></div><p class='WrapLine' id='34-37'><span class='cost-highlight' id='cost-bg-31'><span class='matched' id='39-40'><span class='keyword' id='41-42'>let </span><b class='variable matched' id='32-44'>closure_prototype</b><span class='Operator Assign' id='41-45'> = </span></span><span class='Call' id='39-48'><span class='nav-anchor' id='fn-call-47'><span class='matched' id='50-51'><span class='Symbol matched' id='52-53'>make_struct</span>(</span><span class='Reinterpret' id='50-55'><span class='matched' id='56-57'><span class='Type' id='58-59'>(void *)</span>(</span><span class='UIntImm Imm' id='56-61'>(uint64)0</span><span class='matched' id='56-63'>)</span></span><span class='matched' id='50-65'>, </span><span class='Reinterpret' id='50-67'><span class='matched' id='68-69'><span class='Type' id='70-71'>(void *)</span>(</span><span class='UIntImm Imm' id='68-73'>(uint64)0</span><span class='matched' id='68-75'>)</span></span><span class='matched' id='50-77'>, </span><span class='IntImm Imm' id='50-79'>0</span><span class='matched' id='50-81'>, </span><span class='IntImm Imm' id='50-83'>0</span><span class='matched' id='50-85'>, </span><span class='IntImm Imm' id='50-87'>0</span><span class='matched' id='50-89'>, </span><span class='IntImm Imm' id='50-91'>0</span><span class='matched' id='50-93'>, </span><span class='IntImm Imm' id='50-95'>0</span><span class='matched' id='50-97'>, </span><span class='IntImm Imm' id='50-99'>0</span><span class='matched' id='50-101'>, </span><span class='IntImm Imm' id='50-103'>0</span><span class='matched' id='50-105'>)</span></span></span></span></p><div class='LetStmt' id='34-109'><div class='node-cost' id='110-111'><div id='cc-107' class='cost-btn CostColor1'   aria-describedby='tooltip-cc-107' 
  line-cost='1' block-cost='69'   line-cost-color='1' block-cost-color='19'></div><span id='tooltip-cc-107' class='tooltip cond-tooltop' role='tooltip-cc-107'>Op Count: 1</span><div id='dc-107' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-107'   line-cost='0' block-cost='1472'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-dc-107' class='tooltip cond-tooltop' role='tooltip-dc-107'>Bits Moved: 0</span></div><p class='WrapLine' id='110-113'><span class='cost-highlight' id='cost-bg-107'><span class='matched' id='115-116'><span class='keyword' id='117-118'>let </span><b class='variable matched' id='108-120'>conv_y</b><span class='Operator Assign' id='117-121'> = </span></span><span class='Call' id='115-124'><span class='nav-anchor' id='fn-call-123'><span class='matched' id='126-127'><span class='Symbol matched' id='128-129'>load_typed_struct_member</span>(</span><b class='variable matched' id='24-131'>closure_arg</b><span class='matched' id='126-132'>, </span><b class='variable matched' id='32-134'>closure_prototype</b><span class='matched' id='126-135'>, </span><span class='IntImm Imm' id='126-137'>0</span><span class='matched' id='126-139'>)</span></span></span></span></p><div class='LetStmt' id='110-143'><div class='node-cost' id='144-145'><div id='cc-141' class='cost-btn CostColor1'   aria-describedby='tooltip-cc-141'   line-cost='1' block-cost='68'   line-cost-color='1' block-cost-color='19'></div><span id='tooltip-cc-141' class='tooltip cond-tooltop' role='tooltip-cc-141'>Op Count: 1</span><div id='dc-141' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-141'   line-cost='0' block-cost='1472'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-dc-141' class='tooltip cond-tooltop' role='tooltip-dc-141'>Bits Moved: 0</span></div><p class='WrapLine' id='144-147'><span class='cost-highlight' id='cost-bg-141'><span class='matched' id='149-150'><span class='keyword' id='151-152'>let </span><b class='variable 
matched' id='142-154'>kernel</b><span class='Operator Assign' id='151-155'> = </span></span><span class='Call' id='149-158'><span class='nav-anchor' id='fn-call-157'><span class='matched' id='160-161'><span class='Symbol matched' id='162-163'>load_typed_struct_member</span>(</span><b class='variable matched' id='24-165'>closure_arg</b><span class='matched' id='160-166'>, </span><b class='variable matched' id='32-168'>closure_prototype</b><span class='matched' id='160-169'>, </span><span class='IntImm Imm' id='160-171'>1</span><span class='matched' id='160-173'>)</span></span></span></span></p><div class='LetStmt' id='144-177'><div class='node-cost' id='178-179'><div id='cc-175' class='cost-btn CostColor1'   aria-describedby='tooltip-cc-175'   line-cost='1' block-cost='67'   line-cost-color='1' block-cost-color='19'></div><span id='tooltip-cc-175' class='tooltip cond-tooltop' role='tooltip-cc-175'>Op Count: 1</span><div id='dc-175' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-175'   line-cost='0' block-cost='1472'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-dc-175' class='tooltip cond-tooltop' role='tooltip-dc-175'>Bits Moved: 0</span></div><p class='WrapLine' id='178-181'><span class='cost-highlight' id='cost-bg-175'><span class='matched' id='183-184'><span class='keyword' id='185-186'>let </span><b class='variable matched' id='176-188'>conv_y.extent.0</b><span class='Operator Assign' id='185-189'> = </span></span><span class='Call' id='183-192'><span class='nav-anchor' id='fn-call-191'><span class='matched' id='194-195'><span class='Symbol matched' id='196-197'>load_typed_struct_member</span>(</span><b class='variable matched' id='24-199'>closure_arg</b><span class='matched' id='194-200'>, </span><b class='variable matched' id='32-202'>closure_prototype</b><span class='matched' id='194-203'>, </span><span class='IntImm Imm' id='194-205'>2</span><span class='matched' id='194-207'>)</span></span></span></span></p><div 
class='LetStmt' id='178-211'><div class='node-cost' id='212-213'><div id='cc-209' class='cost-btn CostColor1'   aria-describedby='tooltip-cc-209'   line-cost='1' block-cost='66'   line-cost-color='1' block-cost-color='19'></div><span id='tooltip-cc-209' class='tooltip cond-tooltop' role='tooltip-cc-209'>Op Count: 1</span><div id='dc-209' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-209'   line-cost='0' block-cost='1472'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-dc-209' class='tooltip cond-tooltop' role='tooltip-dc-209'>Bits Moved: 0</span></div><p class='WrapLine' id='212-215'><span class='cost-highlight' id='cost-bg-209'><span class='matched' id='217-218'><span class='keyword' id='219-220'>let </span><b class='variable matched' id='210-222'>conv_y.extent.1</b><span class='Operator Assign' id='219-223'> = </span></span><span class='Call' id='217-226'><span class='nav-anchor' id='fn-call-225'><span class='matched' id='228-229'><span class='Symbol matched' id='230-231'>load_typed_struct_member</span>(</span><b class='variable matched' id='24-233'>closure_arg</b><span class='matched' id='228-234'>, </span><b class='variable matched' id='32-236'>closure_prototype</b><span class='matched' id='228-237'>, </span><span class='IntImm Imm' id='228-239'>3</span><span class='matched' id='228-241'>)</span></span></span></span></p><div class='LetStmt' id='212-245'><div class='node-cost' id='246-247'><div id='cc-243' class='cost-btn CostColor1'   aria-describedby='tooltip-cc-243'   line-cost='1' block-cost='65'   line-cost-color='1' block-cost-color='19'></div><span id='tooltip-cc-243' class='tooltip cond-tooltop' role='tooltip-cc-243'>Op Count: 1</span><div id='dc-243' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-243'   line-cost='0' block-cost='1472'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-dc-243' class='tooltip cond-tooltop' role='tooltip-dc-243'>Bits Moved: 0</span></div><p class='WrapLine' 
id='246-249'><span class='cost-highlight' id='cost-bg-243'><span class='matched' id='251-252'><span class='keyword' id='253-254'>let </span><b class='variable matched' id='244-256'>conv_y.min.1</b><span class='Operator Assign' id='253-257'> = </span></span><span class='Call' id='251-260'><span class='nav-anchor' id='fn-call-259'><span class='matched' id='262-263'><span class='Symbol matched' id='264-265'>load_typed_struct_member</span>(</span><b class='variable matched' id='24-267'>closure_arg</b><span class='matched' id='262-268'>, </span><b class='variable matched' id='32-270'>closure_prototype</b><span class='matched' id='262-271'>, </span><span class='IntImm Imm' id='262-273'>4</span><span class='matched' id='262-275'>)</span></span></span></span></p><div class='LetStmt' id='246-279'><div class='node-cost' id='280-281'><div id='cc-277' class='cost-btn CostColor1'   aria-describedby='tooltip-cc-277'   line-cost='1' block-cost='64'   line-cost-color='1' block-cost-color='19'></div><span id='tooltip-cc-277' class='tooltip cond-tooltop' role='tooltip-cc-277'>Op Count: 1</span><div id='dc-277' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-277'   line-cost='0' block-cost='1472'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-dc-277' class='tooltip cond-tooltop' role='tooltip-dc-277'>Bits Moved: 0</span></div><p class='WrapLine' id='280-283'><span class='cost-highlight' id='cost-bg-277'><span class='matched' id='285-286'><span class='keyword' id='287-288'>let </span><b class='variable matched' id='278-290'>conv_y.stride.1</b><span class='Operator Assign' id='287-291'> = </span></span><span class='Call' id='285-294'><span class='nav-anchor' id='fn-call-293'><span class='matched' id='296-297'><span class='Symbol matched' id='298-299'>load_typed_struct_member</span>(</span><b class='variable matched' id='24-301'>closure_arg</b><span class='matched' id='296-302'>, </span><b class='variable matched' id='32-304'>closure_prototype</b><span 
class='matched' id='296-305'>, </span><span class='IntImm Imm' id='296-307'>5</span><span class='matched' id='296-309'>)</span></span></span></span></p><div class='LetStmt' id='280-313'><div class='node-cost' id='314-315'><div id='cc-311' class='cost-btn CostColor1'   aria-describedby='tooltip-cc-311'   line-cost='1' block-cost='63'   line-cost-color='1' block-cost-color='19'></div><span id='tooltip-cc-311' class='tooltip cond-tooltop' role='tooltip-cc-311'>Op Count: 1</span><div id='dc-311' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-311'   line-cost='0' block-cost='1472'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-dc-311' class='tooltip cond-tooltop' role='tooltip-dc-311'>Bits Moved: 0</span></div><p class='WrapLine' id='314-317'><span class='cost-highlight' id='cost-bg-311'><span class='matched' id='319-320'><span class='keyword' id='321-322'>let </span><b class='variable matched' id='312-324'>t139</b><span class='Operator Assign' id='321-325'> = </span></span><span class='Call' id='319-328'><span class='nav-anchor' id='fn-call-327'><span class='matched' id='330-331'><span class='Symbol matched' id='332-333'>load_typed_struct_member</span>(</span><b class='variable matched' id='24-335'>closure_arg</b><span class='matched' id='330-336'>, </span><b class='variable matched' id='32-338'>closure_prototype</b><span class='matched' id='330-339'>, </span><span class='IntImm Imm' id='330-341'>6</span><span class='matched' id='330-343'>)</span></span></span></span></p><div class='LetStmt' id='314-347'><div class='node-cost' id='348-349'><div id='cc-345' class='cost-btn CostColor1'   aria-describedby='tooltip-cc-345'   line-cost='1' block-cost='62'   line-cost-color='1' block-cost-color='19'></div><span id='tooltip-cc-345' class='tooltip cond-tooltop' role='tooltip-cc-345'>Op Count: 1</span><div id='dc-345' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-345'   line-cost='0' block-cost='1472'   line-cost-color='0' 
block-cost-color='19'></div><span id='tooltip-dc-345' class='tooltip cond-tooltop' role='tooltip-dc-345'>Bits Moved: 0</span></div><p class='WrapLine' id='348-351'><span class='cost-highlight' id='cost-bg-345'><span class='matched' id='353-354'><span class='keyword' id='355-356'>let </span><b class='variable matched' id='346-358'>t140</b><span class='Operator Assign' id='355-359'> = </span></span><span class='Call' id='353-362'><span class='nav-anchor' id='fn-call-361'><span class='matched' id='364-365'><span class='Symbol matched' id='366-367'>load_typed_struct_member</span>(</span><b class='variable matched' id='24-369'>closure_arg</b><span class='matched' id='364-370'>, </span><b class='variable matched' id='32-372'>closure_prototype</b><span class='matched' id='364-373'>, </span><span class='IntImm Imm' id='364-375'>7</span><span class='matched' id='364-377'>)</span></span></span></span></p><div class='LetStmt' id='348-381'><div class='node-cost' id='382-383'><div id='cc-379' class='cost-btn CostColor1'   aria-describedby='tooltip-cc-379'   line-cost='1' block-cost='61'   line-cost-color='1' block-cost-color='19'></div><span id='tooltip-cc-379' class='tooltip cond-tooltop' role='tooltip-cc-379'>Op Count: 1</span><div id='dc-379' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-379'   line-cost='0' block-cost='1472'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-dc-379' class='tooltip cond-tooltop' role='tooltip-dc-379'>Bits Moved: 0</span></div><p class='WrapLine' id='382-385'><span class='cost-highlight' id='cost-bg-379'><span class='matched' id='387-388'><span class='keyword' id='389-390'>let </span><b class='variable matched' id='380-392'>t141</b><span class='Operator Assign' id='389-393'> = </span></span><span class='Call' id='387-396'><span class='nav-anchor' id='fn-call-395'><span class='matched' id='398-399'><span class='Symbol matched' id='400-401'>load_typed_struct_member</span>(</span><b class='variable matched' 
id='24-403'>closure_arg</b><span class='matched' id='398-404'>, </span><b class='variable matched' id='32-406'>closure_prototype</b><span class='matched' id='398-407'>, </span><span class='IntImm Imm' id='398-409'>8</span><span class='matched' id='398-411'>)</span></span></span></span></p><div class='LetStmt' id='382-415'><div class='node-cost' id='416-417'><div id='cc-413' class='cost-btn CostColor4'   aria-describedby='tooltip-cc-413'   line-cost='4' block-cost='60'   line-cost-color='4' block-cost-color='19'></div><span id='tooltip-cc-413' class='tooltip cond-tooltop' role='tooltip-cc-413'>Op Count: 4</span><div id='dc-413' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-413'   line-cost='0' block-cost='1472'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-dc-413' class='tooltip cond-tooltop' role='tooltip-dc-413'>Bits Moved: 0</span></div><p class='WrapLine' id='416-419'><span class='cost-highlight' id='cost-bg-413'><span class='matched' id='421-422'><span class='keyword' id='423-424'>let </span><b class='variable matched' id='414-426'>conv_y.s0.x.xi.base.s</b><span class='Operator Assign' id='423-427'> = </span></span><span class='Min' id='421-429'><span class='matched' id='430-431'><span class='Symbol matched' id='432-433'>min</span>(</span><span class='BinaryOp' id='430-435'><span class='matched' id='436-437'>(</span><span class='BinaryOp' id='436-439'><span class='matched' id='440-441'>(</span><b class='variable matched' id='20-443'>conv_y.s0.x.xo.tile</b> <span class='matched Operator' id='440-444'>%</span> <b class='variable matched' id='312-446'>t139</b><span class='matched' id='440-447'>)</span></span> <span class='matched Operator' id='436-449'>*</span> <span class='IntImm Imm' id='436-451'>64</span><span class='matched' id='436-453'>)</span></span><span class='matched' id='430-455'>, </span><span class='BinaryOp' id='430-457'><span class='matched' id='458-459'>(</span><b class='variable matched' 
id='176-461'>conv_y.extent.0</b> <span class='matched Operator' id='458-462'>+</span> <span class='IntImm Imm' id='458-464'>-64</span><span class='matched' id='458-466'>)</span></span><span class='matched' id='430-468'>)</span></span></span></p><div class='LetStmt' id='416-472'><div class='node-cost' id='473-474'><div id='cc-470' class='cost-btn CostColor4'   aria-describedby='tooltip-cc-470'   line-cost='4' block-cost='56'   line-cost-color='4' block-cost-color='19'></div><span id='tooltip-cc-470' class='tooltip cond-tooltop' role='tooltip-cc-470'>Op Count: 4</span><div id='dc-470' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-470'   line-cost='0' block-cost='1472'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-dc-470' class='tooltip cond-tooltop' role='tooltip-dc-470'>Bits Moved: 0</span></div><p class='WrapLine' id='473-476'><span class='cost-highlight' id='cost-bg-470'><span class='matched' id='478-479'><span class='keyword' id='480-481'>let </span><b class='variable matched' id='471-483'>conv_y.s0.y.yi.base.s</b><span class='Operator Assign' id='480-484'> = </span></span><span class='Min' id='478-486'><span class='matched' id='487-488'><span class='Symbol matched' id='489-490'>min</span>(</span><span class='BinaryOp' id='487-492'><span class='matched' id='493-494'>(</span><span class='BinaryOp' id='493-496'><span class='matched' id='497-498'>(</span><b class='variable matched' id='20-500'>conv_y.s0.x.xo.tile</b> <span class='matched Operator' id='497-501'>/</span> <b class='variable matched' id='312-503'>t139</b><span class='matched' id='497-504'>)</span></span> <span class='matched Operator' id='493-506'>*</span> <span class='IntImm Imm' id='493-508'>64</span><span class='matched' id='493-510'>)</span></span><span class='matched' id='487-512'>, </span><span class='BinaryOp' id='487-514'><span class='matched' id='515-516'>(</span><b class='variable matched' id='210-518'>conv_y.extent.1</b> <span class='matched Operator' 
id='515-519'>+</span> <span class='IntImm Imm' id='515-521'>-64</span><span class='matched' id='515-523'>)</span></span><span class='matched' id='487-525'>)</span></span></span></p><div class='LetStmt' id='473-529'><div class='node-cost' id='530-531'><div id='cc-527' class='cost-btn CostColor2'   aria-describedby='tooltip-cc-527'   line-cost='2' block-cost='52'   line-cost-color='2' block-cost-color='19'></div><span id='tooltip-cc-527' class='tooltip cond-tooltop' role='tooltip-cc-527'>Op Count: 2</span><div id='dc-527' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-527'   line-cost='0' block-cost='1472'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-dc-527' class='tooltip cond-tooltop' role='tooltip-dc-527'>Bits Moved: 0</span></div><p class='WrapLine' id='530-533'><span class='cost-highlight' id='cost-bg-527'><span class='matched' id='535-536'><span class='keyword' id='537-538'>let </span><b class='variable matched' id='528-540'>t142</b><span class='Operator Assign' id='537-541'> = </span></span><span class='BinaryOp' id='535-543'><span class='matched' id='544-545'>(</span><span class='BinaryOp' id='544-547'><span class='matched' id='548-549'>(</span><b class='variable matched' id='414-551'>conv_y.s0.x.xi.base.s</b> <span class='matched Operator' id='548-552'>+</span> <b class='variable matched' id='346-554'>t140</b><span class='matched' id='548-555'>)</span></span> <span class='matched Operator' id='544-557'>+</span> <b class='variable matched' id='471-559'>conv_y.s0.y.yi.base.s</b><span class='matched' id='544-560'>)</span></span></span></p><div class='LetStmt' id='530-564'><div class='node-cost' id='565-566'><div id='cc-562' class='cost-btn CostColor1'   aria-describedby='tooltip-cc-562'   line-cost='1' block-cost='50'   line-cost-color='1' block-cost-color='19'></div><span id='tooltip-cc-562' class='tooltip cond-tooltop' role='tooltip-cc-562'>Op Count: 1</span><div id='dc-562' class='cost-btn CostColor0'   
aria-describedby='tooltip-dc-562'   line-cost='0' block-cost='1472'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-dc-562' class='tooltip cond-tooltop' role='tooltip-dc-562'>Bits Moved: 0</span></div><p class='WrapLine' id='565-568'><span class='cost-highlight' id='cost-bg-562'><span class='matched' id='570-571'><span class='keyword' id='572-573'>let </span><b class='variable matched' id='563-575'>t143</b><span class='Operator Assign' id='572-576'> = </span></span><span class='BinaryOp' id='570-578'><span class='matched' id='579-580'>(</span><b class='variable matched' id='414-582'>conv_y.s0.x.xi.base.s</b> <span class='matched Operator' id='579-583'>+</span> <b class='variable matched' id='380-585'>t141</b><span class='matched' id='579-586'>)</span></span></span></p><div class='LetStmt' id='565-590'><div class='node-cost' id='591-592'><div id='cc-588' class='cost-btn CostColor1'   aria-describedby='tooltip-cc-588'   line-cost='1' block-cost='49'   line-cost-color='1' block-cost-color='19'></div><span id='tooltip-cc-588' class='tooltip cond-tooltop' role='tooltip-cc-588'>Op Count: 1</span><div id='dc-588' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-588'   line-cost='0' block-cost='1472'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-dc-588' class='tooltip cond-tooltop' role='tooltip-dc-588'>Bits Moved: 0</span></div><p class='WrapLine' id='591-594'><span class='cost-highlight' id='cost-bg-588'><span class='matched' id='596-597'><span class='keyword' id='598-599'>let </span><b class='variable matched' id='589-601'>t144</b><span class='Operator Assign' id='598-602'> = </span></span><span class='BinaryOp' id='596-604'><span class='matched' id='605-606'>(</span><b class='variable matched' id='244-608'>conv_y.min.1</b> <span class='matched Operator' id='605-609'>+</span> <b class='variable matched' id='471-611'>conv_y.s0.y.yi.base.s</b><span class='matched' id='605-612'>)</span></span></span></p><div class='For' 
id='591-615'><div class='node-cost' id='616-617'><div id='cc-614' class='cost-btn CostColor0'   aria-describedby='tooltip-cc-614'   line-cost='0' block-cost='48'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-cc-614' class='tooltip cond-tooltop' role='tooltip-cc-614'>Op Count: 0</span><div id='dc-614' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-614'   line-cost='0' block-cost='1472'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-dc-614' class='tooltip cond-tooltop' role='tooltip-dc-614'>Bits Moved: 0</span></div><a onclick='return toggle(614);' href=_blank><div class='show-hide-btn-wrapper'>  <div class='show-hide-btn' style='display:none;' id=614-show>    <i class='bi bi-plus-square' title='Expand code block'></i>  </div>  <div class='show-hide-btn' id=614-hide>    <i class='bi bi-dash-square' title='Collapse code block'></i>  </div></div><span class='matched' id='616-619'><span class='keyword nav-anchor' id='loop-614'>for</span> (</span><b class='variable matched' id='614-622'>conv_y.s0.y.yi</b><span class='matched' id='616-623'>, </span><span class='IntImm Imm' id='616-625'>0</span><span class='matched' id='616-627'>, </span><span class='IntImm Imm' id='616-629'>64</span><span class='matched' id='616-631'>)</span></a><button class='icon-btn sync-btn' onclick='scrollToViz("loop-viz-614")'>  <i class='bi bi-arrow-right-square' title='Jump to visualization'></i></button><span class='matched' id='616-633'>{</span><div class='indent ForBody' id='614'><div class='LetStmt' id='635-638'><div class='node-cost' id='639-640'><div id='cc-636' class='cost-btn CostColor3'   aria-describedby='tooltip-cc-636'   line-cost='3' block-cost='48'   line-cost-color='3' block-cost-color='19'></div><span id='tooltip-cc-636' class='tooltip cond-tooltop' role='tooltip-cc-636'>Op Count: 3</span><div id='dc-636' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-636'   line-cost='0' block-cost='1472'   line-cost-color='0' 
block-cost-color='19'></div><span id='tooltip-dc-636' class='tooltip cond-tooltop' role='tooltip-dc-636'>Bits Moved: 0</span></div><p class='WrapLine' id='639-642'><span class='cost-highlight' id='cost-bg-636'><span class='matched' id='644-645'><span class='keyword' id='646-647'>let </span><b class='variable matched' id='637-649'>t146</b><span class='Operator Assign' id='646-650'> = </span></span><span class='BinaryOp' id='644-652'><span class='matched' id='653-654'>(</span><span class='BinaryOp' id='653-656'><span class='matched' id='657-658'>(</span><span class='BinaryOp' id='657-660'><span class='matched' id='661-662'>(</span><b class='variable matched' id='614-664'>conv_y.s0.y.yi</b> <span class='matched Operator' id='661-665'>+</span> <b class='variable matched' id='589-667'>t144</b><span class='matched' id='661-668'>)</span></span> <span class='matched Operator' id='657-670'>*</span> <b class='variable matched' id='278-672'>conv_y.stride.1</b><span class='matched' id='657-673'>)</span></span> <span class='matched Operator' id='653-675'>+</span> <b class='variable matched' id='563-677'>t143</b><span class='matched' id='653-678'>)</span></span></span></p><div class='LetStmt' id='639-682'><div class='node-cost' id='683-684'><div id='cc-680' class='cost-btn CostColor1'   aria-describedby='tooltip-cc-680'   line-cost='1' block-cost='45'   line-cost-color='1' block-cost-color='19'></div><span id='tooltip-cc-680' class='tooltip cond-tooltop' role='tooltip-cc-680'>Op Count: 1</span><div id='dc-680' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-680'   line-cost='0' block-cost='1472'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-dc-680' class='tooltip cond-tooltop' role='tooltip-dc-680'>Bits Moved: 0</span></div><p class='WrapLine' id='683-686'><span class='cost-highlight' id='cost-bg-680'><span class='matched' id='688-689'><span class='keyword' id='690-691'>let </span><b class='variable matched' id='681-693'>t145</b><span 
class='Operator Assign' id='690-694'> = </span></span><span class='BinaryOp' id='688-696'><span class='matched' id='697-698'>(</span><b class='variable matched' id='614-700'>conv_y.s0.y.yi</b> <span class='matched Operator' id='697-701'>+</span> <b class='variable matched' id='528-703'>t142</b><span class='matched' id='697-704'>)</span></span></span></p><div class='For' id='683-707'><div class='node-cost' id='708-709'><div id='cc-706' class='cost-btn CostColor0'   aria-describedby='tooltip-cc-706'   line-cost='0' block-cost='44'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-cc-706' class='tooltip cond-tooltop' role='tooltip-cc-706'>Op Count: 0</span><div id='dc-706' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-706'   line-cost='0' block-cost='1472'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-dc-706' class='tooltip cond-tooltop' role='tooltip-dc-706'>Bits Moved: 0</span></div><a onclick='return toggle(706);' href=_blank><div class='show-hide-btn-wrapper'>  <div class='show-hide-btn' style='display:none;' id=706-show>    <i class='bi bi-plus-square' title='Expand code block'></i>  </div>  <div class='show-hide-btn' id=706-hide>    <i class='bi bi-dash-square' title='Collapse code block'></i>  </div></div><span class='matched' id='708-711'><span class='keyword nav-anchor' id='loop-706'>for</span> (</span><b class='variable matched' id='706-714'>conv_y.s0.x.xi.xi</b><span class='matched' id='708-715'>, </span><span class='IntImm Imm' id='708-717'>0</span><span class='matched' id='708-719'>, </span><span class='IntImm Imm' id='708-721'>16</span><span class='matched' id='708-723'>)</span></a><button class='icon-btn sync-btn' onclick='scrollToViz("loop-viz-706")'>  <i class='bi bi-arrow-right-square' title='Jump to visualization'></i></button><span class='matched' id='708-725'>{</span><div class='indent ForBody' id='706'><div class='Allocate' id='727-730'><div class='node-cost' id='731-732'><div id='cc-728' 
class='cost-btn CostColor0'   aria-describedby='tooltip-cc-728'   line-cost='0' block-cost='44'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-cc-728' class='tooltip cond-tooltop' role='tooltip-cc-728'>Op Count: 0</span><div id='dc-728' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-728'   line-cost='0' block-cost='1472'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-dc-728' class='tooltip cond-tooltop' role='tooltip-dc-728'>Bits Moved: 0</span></div><span class='matched' id='731-734'><span class='keyword nav-anchor' id='allocate-728'>allocate </span><b class='variable matched' id='729-737'>conv_x</b>[</span><span class='Type' id='731-738'>float32</span> * <span class='IntImm Imm' id='731-740'>4</span> * <span class='IntImm Imm' id='731-742'>20</span><span class='matched' id='731-744'>]</span><button class='icon-btn sync-btn' onclick='scrollToViz("allocate-viz-728")'>  <i class='bi bi-arrow-right-square' title='Jump to visualization'></i></button><div class='AllocateBody' id='731-746'><div class='Block' id='747-748'><div class='Produce' id='749-751'><div class='node-cost' id='752-753'><div id='cc-750' class='cost-btn CostColor0'   aria-describedby='tooltip-cc-750'   line-cost='0' block-cost='25'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-cc-750' class='tooltip cond-tooltop' role='tooltip-cc-750'>Op Count: 0</span><div id='dc-750' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-750'   line-cost='0' block-cost='672'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-dc-750' class='tooltip cond-tooltop' role='tooltip-dc-750'>Bits Moved: 0</span></div><a onclick='return toggle(750);' href=_blank><div class='show-hide-btn-wrapper'>  <div class='show-hide-btn' style='display:none;' id=750-show>    <i class='bi bi-plus-square' title='Expand code block'></i>  </div>  <div class='show-hide-btn' id=750-hide>    <i class='bi bi-dash-square' title='Collapse code 
block'></i>  </div></div><span class='matched' id='752-755'><span class='keyword nav-anchor' id='prodcons-750'>produce </span><b class='variable matched' id='750-758'>conv_x</b></span></a><button class='icon-btn sync-btn' onclick='scrollToViz("prodcons-viz-750")'>  <i class='bi bi-arrow-right-square' title='Jump to visualization'></i></button><span class='matched' id='752-759'>{</span><div class='indent ProducerConsumerBody' id='750'><div class='LetStmt' id='761-764'><div class='node-cost' id='765-766'><div id='cc-762' class='cost-btn CostColor2'   aria-describedby='tooltip-cc-762'   line-cost='2' block-cost='25'   line-cost-color='2' block-cost-color='19'></div><span id='tooltip-cc-762' class='tooltip cond-tooltop' role='tooltip-cc-762'>Op Count: 2</span><div id='dc-762' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-762'   line-cost='0' block-cost='672'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-dc-762' class='tooltip cond-tooltop' role='tooltip-dc-762'>Bits Moved: 0</span></div><p class='WrapLine' id='765-768'><span class='cost-highlight' id='cost-bg-762'><span class='matched' id='770-771'><span class='keyword' id='772-773'>let </span><b class='variable matched' id='763-775'>t147</b><span class='Operator Assign' id='772-776'> = </span></span><span class='BinaryOp' id='770-778'><span class='matched' id='779-780'>(</span><span class='BinaryOp' id='779-782'><span class='matched' id='783-784'>(</span><b class='variable matched' id='706-786'>conv_y.s0.x.xi.xi</b> <span class='matched Operator' id='783-787'>*</span> <span class='IntImm Imm' id='783-789'>4</span><span class='matched' id='783-791'>)</span></span> <span class='matched Operator' id='779-793'>+</span> <b class='variable matched' id='681-795'>t145</b><span class='matched' id='779-796'>)</span></span></span></p><div class='For' id='765-799'><div class='node-cost' id='800-801'><div id='cc-798' class='cost-btn CostColor0'   aria-describedby='tooltip-cc-798'   line-cost='0' 
block-cost='23'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-cc-798' class='tooltip cond-tooltop' role='tooltip-cc-798'>Op Count: 0</span><div id='dc-798' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-798'   line-cost='0' block-cost='672'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-dc-798' class='tooltip cond-tooltop' role='tooltip-dc-798'>Bits Moved: 0</span></div><a onclick='return toggle(798);' href=_blank><div class='show-hide-btn-wrapper'>  <div class='show-hide-btn' style='display:none;' id=798-show>    <i class='bi bi-plus-square' title='Expand code block'></i>  </div>  <div class='show-hide-btn' id=798-hide>    <i class='bi bi-dash-square' title='Collapse code block'></i>  </div></div><span class='matched' id='800-803'><span class='keyword nav-anchor' id='loop-798'>for</span> (</span><b class='variable matched' id='798-806'>conv_x.s0.y.rebased</b><span class='matched' id='800-807'>, </span><span class='IntImm Imm' id='800-809'>0</span><span class='matched' id='800-811'>, </span><span class='IntImm Imm' id='800-813'>20</span><span class='matched' id='800-815'>)</span></a><button class='icon-btn sync-btn' onclick='scrollToViz("loop-viz-798")'>  <i class='bi bi-arrow-right-square' title='Jump to visualization'></i></button><span class='matched' id='800-817'>{</span><div class='indent ForBody' id='798'><div class='Allocate' id='819-822'><div class='node-cost' id='823-824'><div id='cc-820' class='cost-btn CostColor0'   aria-describedby='tooltip-cc-820'   line-cost='0' block-cost='23'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-cc-820' class='tooltip cond-tooltop' role='tooltip-cc-820'>Op Count: 0</span><div id='dc-820' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-820'   line-cost='0' block-cost='672'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-dc-820' class='tooltip cond-tooltop' role='tooltip-dc-820'>Bits Moved: 0</span></div><span 
class='matched' id='823-826'><span class='keyword nav-anchor' id='allocate-820'>allocate </span><b class='variable matched' id='821-829'>conv_x$1</b>[</span><span class='Type' id='823-830'>float32</span> * <span class='IntImm Imm' id='823-832'>4</span><span class='matched' id='823-834'>]</span><button class='icon-btn sync-btn' onclick='scrollToViz("allocate-viz-820")'>  <i class='bi bi-arrow-right-square' title='Jump to visualization'></i></button><div class='AllocateBody' id='823-836'><div class='Block' id='837-838'><div class='Produce' id='839-841'><div class='node-cost' id='842-843'><div id='cc-840' class='cost-btn CostColor0'   aria-describedby='tooltip-cc-840'   line-cost='0' block-cost='20'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-cc-840' class='tooltip cond-tooltop' role='tooltip-cc-840'>Op Count: 0</span><div id='dc-840' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-840'   line-cost='0' block-cost='416'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-dc-840' class='tooltip cond-tooltop' role='tooltip-dc-840'>Bits Moved: 0</span></div><a onclick='return toggle(840);' href=_blank><div class='show-hide-btn-wrapper'>  <div class='show-hide-btn' style='display:none;' id=840-show>    <i class='bi bi-plus-square' title='Expand code block'></i>  </div>  <div class='show-hide-btn' id=840-hide>    <i class='bi bi-dash-square' title='Collapse code block'></i>  </div></div><span class='matched' id='842-845'><span class='keyword nav-anchor' id='prodcons-840'>produce </span><b class='variable matched' id='840-848'>conv_x$1</b></span></a><button class='icon-btn sync-btn' onclick='scrollToViz("prodcons-viz-840")'>  <i class='bi bi-arrow-right-square' title='Jump to visualization'></i></button><span class='matched' id='842-849'>{</span><div class='indent ProducerConsumerBody' id='840'><div class='Block' id='851-852'><div class='Store WrapLine' id='853-855'><div class='node-cost' id='856-857'><div id='cc-854' 
class='cost-btn CostColor2'   aria-describedby='tooltip-cc-854'   line-cost='2' block-cost='2'   line-cost-color='2' block-cost-color='2'></div><span id='tooltip-cc-854' class='tooltip cond-tooltop' role='tooltip-cc-854'>Op Count: 2</span><div id='dc-854' class='cost-btn CostColor6'   aria-describedby='tooltip-dc-854'   line-cost='128' block-cost='128'   line-cost-color='6' block-cost-color='6'></div><span id='tooltip-dc-854' class='tooltip cond-tooltop' role='tooltip-dc-854'>Bits Moved: 128</span></div><span class='matched' id='856-859'><span class='nav-anchor' id='store-854'><b class='variable matched' id='840-862'>conv_x$1</b>[</span></span><span class='Ramp' id='856-863'><span class='matched' id='864-865'><span class='Symbol matched' id='866-867'>ramp</span>(</span><span class='IntImm Imm' id='864-869'>0</span><span class='matched' id='864-871'>, </span><span class='IntImm Imm' id='864-873'>1</span><span class='matched' id='864-875'>, </span><span class='IntImm Imm' id='864-877'>4</span><span class='matched' id='864-879'>)</span></span><span class='matched' id='856-881'>]</span><span class='Operator Assign Matched' id='856-883'> = </span><span class='StoreValue' id='856-885'><span class='Broadcast' id='886-887'><span class='matched' id='888-889'>x4(</span><span class='FloatImm Imm' id='888-891'>0.000000f</span><span class='matched' id='888-893'>)</span></span></span></div><div class='LetStmt' id='853-897'><div class='node-cost' id='898-899'><div id='cc-895' class='cost-btn CostColor1'   aria-describedby='tooltip-cc-895'   line-cost='1' block-cost='18'   line-cost-color='1' block-cost-color='18'></div><span id='tooltip-cc-895' class='tooltip cond-tooltop' role='tooltip-cc-895'>Op Count: 1</span><div id='dc-895' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-895'   line-cost='0' block-cost='288'   line-cost-color='0' block-cost-color='13'></div><span id='tooltip-dc-895' class='tooltip cond-tooltop' role='tooltip-dc-895'>Bits Moved: 0</span></div><p 
class='WrapLine' id='898-901'><span class='cost-highlight' id='cost-bg-895'><span class='matched' id='903-904'><span class='keyword' id='905-906'>let </span><b class='variable matched' id='896-908'>t148</b><span class='Operator Assign' id='905-909'> = </span></span><span class='BinaryOp' id='903-911'><span class='matched' id='912-913'>(</span><b class='variable matched' id='798-915'>conv_x.s0.y.rebased</b> <span class='matched Operator' id='912-916'>+</span> <b class='variable matched' id='763-918'>t147</b><span class='matched' id='912-919'>)</span></span></span></p><div class='For' id='898-922'><div class='node-cost' id='923-924'><div id='cc-921' class='cost-btn CostColor0'   aria-describedby='tooltip-cc-921'   line-cost='0' block-cost='17'   line-cost-color='0' block-cost-color='17'></div><span id='tooltip-cc-921' class='tooltip cond-tooltop' role='tooltip-cc-921'>Op Count: 0</span><div id='dc-921' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-921'   line-cost='0' block-cost='288'   line-cost-color='0' block-cost-color='13'></div><span id='tooltip-dc-921' class='tooltip cond-tooltop' role='tooltip-dc-921'>Bits Moved: 0</span></div><a onclick='return toggle(921);' href=_blank><div class='show-hide-btn-wrapper'>  <div class='show-hide-btn' style='display:none;' id=921-show>    <i class='bi bi-plus-square' title='Expand code block'></i>  </div>  <div class='show-hide-btn' id=921-hide>    <i class='bi bi-dash-square' title='Collapse code block'></i>  </div></div><span class='matched' id='923-926'><span class='keyword nav-anchor' id='loop-921'>for</span> (</span><b class='variable matched' id='921-929'>conv_x$1.s1.k$x</b><span class='matched' id='923-930'>, </span><span class='IntImm Imm' id='923-932'>0</span><span class='matched' id='923-934'>, </span><span class='IntImm Imm' id='923-936'>20</span><span class='matched' id='923-938'>)</span></a><button class='icon-btn sync-btn' onclick='scrollToViz("loop-viz-921")'>  <i class='bi bi-arrow-right-square' 
title='Jump to visualization'></i></button><span class='matched' id='923-940'>{</span><div class='indent ForBody' id='921'><div class='Store WrapLine' id='942-944'><div class='node-cost' id='945-946'><div id='cc-943' class='cost-btn CostColor17'   aria-describedby='tooltip-cc-943'   line-cost='17' block-cost='17'   line-cost-color='17' block-cost-color='17'></div><span id='tooltip-cc-943' class='tooltip cond-tooltop' role='tooltip-cc-943'>Op Count: 17</span><div id='dc-943' class='cost-btn CostColor13'   aria-describedby='tooltip-dc-943'   line-cost='288' block-cost='288'   line-cost-color='13' block-cost-color='13'></div><span id='tooltip-dc-943' class='tooltip cond-tooltop' role='tooltip-dc-943'>Bits Moved: 288</span></div><span class='matched' id='945-948'><span class='nav-anchor' id='store-943'><b class='variable matched' id='840-951'>conv_x$1</b>[</span></span><span class='Ramp' id='945-952'><span class='matched' id='953-954'><span class='Symbol matched' id='955-956'>ramp</span>(</span><span class='IntImm Imm' id='953-958'>0</span><span class='matched' id='953-960'>, </span><span class='IntImm Imm' id='953-962'>1</span><span class='matched' id='953-964'>, </span><span class='IntImm Imm' id='953-966'>4</span><span class='matched' id='953-968'>)</span></span><span class='matched' id='945-970'>]</span><span class='Operator Assign Matched' id='945-972'> = </span><span class='StoreValue' id='945-974'><span class='BinaryOp' id='975-976'><span class='matched' id='977-978'>(</span><span class='Load nav-anchor' id='load-980'><span class='matched' id='981-982'><b class='variable matched' id='840-984'>conv_x$1</b>[</span><span class='Ramp' id='981-985'><span class='matched' id='986-987'><span class='Symbol matched' id='988-989'>ramp</span>(</span><span class='IntImm Imm' id='986-991'>0</span><span class='matched' id='986-993'>, </span><span class='IntImm Imm' id='986-995'>1</span><span class='matched' id='986-997'>, </span><span class='IntImm Imm' 
id='986-999'>4</span><span class='matched' id='986-1001'>)</span></span><span class='matched' id='981-1003'>]</span></span> <span class='matched Operator' id='977-1005'>+</span> <span class='BinaryOp' id='977-1007'><span class='matched' id='1008-1009'>(</span><span class='Cast' id='1008-1011'><span class='matched' id='1012-1013'><span class='Type' id='1014-1015'>float32x4</span>(</span><span class='Ramp' id='1012-1017'><span class='matched' id='1018-1019'><span class='Symbol matched' id='1020-1021'>ramp</span>(</span><span class='BinaryOp' id='1018-1023'><span class='matched' id='1024-1025'>(</span><span class='BinaryOp' id='1024-1027'><span class='matched' id='1028-1029'>(</span><b class='variable matched' id='921-1031'>conv_x$1.s1.k$x</b> <span class='matched Operator' id='1028-1032'>+</span> <b class='variable matched' id='896-1034'>t148</b><span class='matched' id='1028-1035'>)</span></span> <span class='matched Operator' id='1024-1037'>+</span> <span class='IntImm Imm' id='1024-1039'>-20</span><span class='matched' id='1024-1041'>)</span></span><span class='matched' id='1018-1043'>, </span><span class='IntImm Imm' id='1018-1045'>1</span><span class='matched' id='1018-1047'>, </span><span class='IntImm Imm' id='1018-1049'>4</span><span class='matched' id='1018-1051'>)</span></span><span class='matched' id='1012-1053'>)</span></span> <span class='matched Operator' id='1008-1055'>*</span> <span class='Broadcast' id='1008-1057'><span class='matched' id='1058-1059'>x4(</span><span class='Load nav-anchor' id='load-1061'><span class='matched' id='1062-1063'><b class='variable matched' id='142-1065'>kernel</b>[</span><b class='variable matched' id='921-1066'>conv_x$1.s1.k$x</b><span class='matched' id='1062-1067'>]</span></span><span class='matched' id='1058-1069'>)</span></span><span class='matched' id='1008-1071'>)</span></span><span class='matched' id='977-1073'>)</span></span></span></div></div><span class='matched ClosingBrace cb-921' 
id='923-1075'>}</span></div></div></div></div><span class='matched ClosingBrace cb-840' id='842-1077'>}</span></div><div class='Consumer' id='839-1080'><div class='node-cost' id='1081-1082'><div id='cc-1079' class='cost-btn CostColor0'   aria-describedby='tooltip-cc-1079'   line-cost='0' block-cost='3'   line-cost-color='0' block-cost-color='3'></div><span id='tooltip-cc-1079' class='tooltip cond-tooltop' role='tooltip-cc-1079'>Op Count: 0</span><div id='dc-1079' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-1079'   line-cost='0' block-cost='256'   line-cost-color='0' block-cost-color='12'></div><span id='tooltip-dc-1079' class='tooltip cond-tooltop' role='tooltip-dc-1079'>Bits Moved: 0</span></div><a onclick='return toggle(1079);' href=_blank><div class='show-hide-btn-wrapper'>  <div class='show-hide-btn' style='display:none;' id=1079-show>    <i class='bi bi-plus-square' title='Expand code block'></i>  </div>  <div class='show-hide-btn' id=1079-hide>    <i class='bi bi-dash-square' title='Collapse code block'></i>  </div></div><span class='matched' id='1081-1084'><span class='keyword nav-anchor' id='prodcons-1079'>consume </span><b class='variable matched' id='1079-1087'>conv_x$1</b></span></a><button class='icon-btn sync-btn' onclick='scrollToViz("prodcons-viz-1079")'>  <i class='bi bi-arrow-right-square' title='Jump to visualization'></i></button><span class='matched' id='1081-1088'>{</span><div class='indent ProducerConsumerBody' id='1079'><div class='Store WrapLine' id='1090-1092'><div class='node-cost' id='1093-1094'><div id='cc-1091' class='cost-btn CostColor3'   aria-describedby='tooltip-cc-1091'   line-cost='3' block-cost='3'   line-cost-color='3' block-cost-color='3'></div><span id='tooltip-cc-1091' class='tooltip cond-tooltop' role='tooltip-cc-1091'>Op Count: 3</span><div id='dc-1091' class='cost-btn CostColor12'   aria-describedby='tooltip-dc-1091'   line-cost='256' block-cost='256'   line-cost-color='12' block-cost-color='12'></div><span 
id='tooltip-dc-1091' class='tooltip cond-tooltop' role='tooltip-dc-1091'>Bits Moved: 256</span></div><span class='matched' id='1093-1096'><span class='nav-anchor' id='store-1091'><b class='variable matched' id='750-1099'>conv_x</b>[</span></span><span class='Ramp' id='1093-1100'><span class='matched' id='1101-1102'><span class='Symbol matched' id='1103-1104'>ramp</span>(</span><span class='BinaryOp' id='1101-1106'><span class='matched' id='1107-1108'>(</span><b class='variable matched' id='798-1110'>conv_x.s0.y.rebased</b> <span class='matched Operator' id='1107-1111'>*</span> <span class='IntImm Imm' id='1107-1113'>4</span><span class='matched' id='1107-1115'>)</span></span><span class='matched' id='1101-1117'>, </span><span class='IntImm Imm' id='1101-1119'>1</span><span class='matched' id='1101-1121'>, </span><span class='IntImm Imm' id='1101-1123'>4</span><span class='matched' id='1101-1125'>)</span></span><span class='matched' id='1093-1127'>]</span><span class='Operator Assign Matched' id='1093-1129'> = </span><span class='StoreValue' id='1093-1131'><span class='Load nav-anchor' id='load-1133'><span class='matched' id='1134-1135'><b class='variable matched' id='1079-1137'>conv_x$1</b>[</span><span class='Ramp' id='1134-1138'><span class='matched' id='1139-1140'><span class='Symbol matched' id='1141-1142'>ramp</span>(</span><span class='IntImm Imm' id='1139-1144'>0</span><span class='matched' id='1139-1146'>, </span><span class='IntImm Imm' id='1139-1148'>1</span><span class='matched' id='1139-1150'>, </span><span class='IntImm Imm' id='1139-1152'>4</span><span class='matched' id='1139-1154'>)</span></span><span class='matched' id='1134-1156'>]</span></span></span></div></div><span class='matched ClosingBrace cb-1079' id='1081-1158'>}</span></div><div class='Free WrapLine' id='839-1160'><div class='node-cost' id='1161-1163'><div id='cc-1162' class='cost-btn CostColor0'   aria-describedby='tooltip-cc-1162'   line-cost='0' block-cost='0'   line-cost-color='0' 
block-cost-color='0'></div><span id='tooltip-cc-1162' class='tooltip cond-tooltop' role='tooltip-cc-1162'>Op Count: 0</span><div id='dc-1162' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-1162'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-dc-1162' class='tooltip cond-tooltop' role='tooltip-dc-1162'>Bits Moved: 0</span></div><span class='keyword' id='1161-1165'>free </span><b class='variable matched' id='821-1167'>conv_x$1</b></div></div></div></div></div><span class='matched ClosingBrace cb-798' id='800-1168'>}</span></div></div></div><span class='matched ClosingBrace cb-750' id='752-1170'>}</span></div><div class='Consumer' id='749-1173'><div class='node-cost' id='1174-1175'><div id='cc-1172' class='cost-btn CostColor0'   aria-describedby='tooltip-cc-1172'   line-cost='0' block-cost='19'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-cc-1172' class='tooltip cond-tooltop' role='tooltip-cc-1172'>Op Count: 0</span><div id='dc-1172' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-1172'   line-cost='0' block-cost='800'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-dc-1172' class='tooltip cond-tooltop' role='tooltip-dc-1172'>Bits Moved: 0</span></div><a onclick='return toggle(1172);' href=_blank><div class='show-hide-btn-wrapper'>  <div class='show-hide-btn' style='display:none;' id=1172-show>    <i class='bi bi-plus-square' title='Expand code block'></i>  </div>  <div class='show-hide-btn' id=1172-hide>    <i class='bi bi-dash-square' title='Collapse code block'></i>  </div></div><span class='matched' id='1174-1177'><span class='keyword nav-anchor' id='prodcons-1172'>consume </span><b class='variable matched' id='1172-1180'>conv_x</b></span></a><button class='icon-btn sync-btn' onclick='scrollToViz("prodcons-viz-1172")'>  <i class='bi bi-arrow-right-square' title='Jump to visualization'></i></button><span class='matched' id='1174-1181'>{</span><div 
class='indent ProducerConsumerBody' id='1172'><div class='Allocate' id='1183-1186'><div class='node-cost' id='1187-1188'><div id='cc-1184' class='cost-btn CostColor0'   aria-describedby='tooltip-cc-1184'   line-cost='0' block-cost='19'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-cc-1184' class='tooltip cond-tooltop' role='tooltip-cc-1184'>Op Count: 0</span><div id='dc-1184' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-1184'   line-cost='0' block-cost='800'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-dc-1184' class='tooltip cond-tooltop' role='tooltip-dc-1184'>Bits Moved: 0</span></div><span class='matched' id='1187-1190'><span class='keyword nav-anchor' id='allocate-1184'>allocate </span><b class='variable matched' id='1185-1193'>conv_y$1</b>[</span><span class='Type' id='1187-1194'>float32</span> * <span class='IntImm Imm' id='1187-1196'>4</span><span class='matched' id='1187-1198'>]</span><button class='icon-btn sync-btn' onclick='scrollToViz("allocate-viz-1184")'>  <i class='bi bi-arrow-right-square' title='Jump to visualization'></i></button><div class='AllocateBody' id='1187-1200'><div class='Block' id='1201-1202'><div class='Produce' id='1203-1205'><div class='node-cost' id='1206-1207'><div id='cc-1204' class='cost-btn CostColor0'   aria-describedby='tooltip-cc-1204'   line-cost='0' block-cost='15'   line-cost-color='0' block-cost-color='15'></div><span id='tooltip-cc-1204' class='tooltip cond-tooltop' role='tooltip-cc-1204'>Op Count: 0</span><div id='dc-1204' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-1204'   line-cost='0' block-cost='544'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-dc-1204' class='tooltip cond-tooltop' role='tooltip-dc-1204'>Bits Moved: 0</span></div><a onclick='return toggle(1204);' href=_blank><div class='show-hide-btn-wrapper'>  <div class='show-hide-btn' style='display:none;' id=1204-show>    <i class='bi bi-plus-square' title='Expand 
code block'></i>  </div>  <div class='show-hide-btn' id=1204-hide>    <i class='bi bi-dash-square' title='Collapse code block'></i>  </div></div><span class='matched' id='1206-1209'><span class='keyword nav-anchor' id='prodcons-1204'>produce </span><b class='variable matched' id='1204-1212'>conv_y$1</b></span></a><button class='icon-btn sync-btn' onclick='scrollToViz("prodcons-viz-1204")'>  <i class='bi bi-arrow-right-square' title='Jump to visualization'></i></button><span class='matched' id='1206-1213'>{</span><div class='indent ProducerConsumerBody' id='1204'><div class='Block' id='1215-1216'><div class='Store WrapLine' id='1217-1219'><div class='node-cost' id='1220-1221'><div id='cc-1218' class='cost-btn CostColor2'   aria-describedby='tooltip-cc-1218'   line-cost='2' block-cost='2'   line-cost-color='2' block-cost-color='2'></div><span id='tooltip-cc-1218' class='tooltip cond-tooltop' role='tooltip-cc-1218'>Op Count: 2</span><div id='dc-1218' class='cost-btn CostColor6'   aria-describedby='tooltip-dc-1218'   line-cost='128' block-cost='128'   line-cost-color='6' block-cost-color='6'></div><span id='tooltip-dc-1218' class='tooltip cond-tooltop' role='tooltip-dc-1218'>Bits Moved: 128</span></div><span class='matched' id='1220-1223'><span class='nav-anchor' id='store-1218'><b class='variable matched' id='1204-1226'>conv_y$1</b>[</span></span><span class='Ramp' id='1220-1227'><span class='matched' id='1228-1229'><span class='Symbol matched' id='1230-1231'>ramp</span>(</span><span class='IntImm Imm' id='1228-1233'>0</span><span class='matched' id='1228-1235'>, </span><span class='IntImm Imm' id='1228-1237'>1</span><span class='matched' id='1228-1239'>, </span><span class='IntImm Imm' id='1228-1241'>4</span><span class='matched' id='1228-1243'>)</span></span><span class='matched' id='1220-1245'>]</span><span class='Operator Assign Matched' id='1220-1247'> = </span><span class='StoreValue' id='1220-1249'><span class='Broadcast' id='1250-1251'><span class='matched' 
id='1252-1253'>x4(</span><span class='FloatImm Imm' id='1252-1255'>0.000000f</span><span class='matched' id='1252-1257'>)</span></span></span></div><div class='For' id='1217-1260'><div class='node-cost' id='1261-1262'><div id='cc-1259' class='cost-btn CostColor0'   aria-describedby='tooltip-cc-1259'   line-cost='0' block-cost='13'   line-cost-color='0' block-cost-color='13'></div><span id='tooltip-cc-1259' class='tooltip cond-tooltop' role='tooltip-cc-1259'>Op Count: 0</span><div id='dc-1259' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-1259'   line-cost='0' block-cost='416'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-dc-1259' class='tooltip cond-tooltop' role='tooltip-dc-1259'>Bits Moved: 0</span></div><a onclick='return toggle(1259);' href=_blank><div class='show-hide-btn-wrapper'>  <div class='show-hide-btn' style='display:none;' id=1259-show>    <i class='bi bi-plus-square' title='Expand code block'></i>  </div>  <div class='show-hide-btn' id=1259-hide>    <i class='bi bi-dash-square' title='Collapse code block'></i>  </div></div><span class='matched' id='1261-1264'><span class='keyword nav-anchor' id='loop-1259'>for</span> (</span><b class='variable matched' id='1259-1267'>conv_y$1.s1.k$x</b><span class='matched' id='1261-1268'>, </span><span class='IntImm Imm' id='1261-1270'>0</span><span class='matched' id='1261-1272'>, </span><span class='IntImm Imm' id='1261-1274'>20</span><span class='matched' id='1261-1276'>)</span></a><button class='icon-btn sync-btn' onclick='scrollToViz("loop-viz-1259")'>  <i class='bi bi-arrow-right-square' title='Jump to visualization'></i></button><span class='matched' id='1261-1278'>{</span><div class='indent ForBody' id='1259'><div class='Store WrapLine' id='1280-1282'><div class='node-cost' id='1283-1284'><div id='cc-1281' class='cost-btn CostColor13'   aria-describedby='tooltip-cc-1281'   line-cost='13' block-cost='13'   line-cost-color='13' block-cost-color='13'></div><span 
id='tooltip-cc-1281' class='tooltip cond-tooltop' role='tooltip-cc-1281'>Op Count: 13</span><div id='dc-1281' class='cost-btn CostColor19'   aria-describedby='tooltip-dc-1281'   line-cost='416' block-cost='416'   line-cost-color='19' block-cost-color='19'></div><span id='tooltip-dc-1281' class='tooltip cond-tooltop' role='tooltip-dc-1281'>Bits Moved: 416</span></div><span class='matched' id='1283-1286'><span class='nav-anchor' id='store-1281'><b class='variable matched' id='1204-1289'>conv_y$1</b>[</span></span><span class='Ramp' id='1283-1290'><span class='matched' id='1291-1292'><span class='Symbol matched' id='1293-1294'>ramp</span>(</span><span class='IntImm Imm' id='1291-1296'>0</span><span class='matched' id='1291-1298'>, </span><span class='IntImm Imm' id='1291-1300'>1</span><span class='matched' id='1291-1302'>, </span><span class='IntImm Imm' id='1291-1304'>4</span><span class='matched' id='1291-1306'>)</span></span><span class='matched' id='1283-1308'>]</span><span class='Operator Assign Matched' id='1283-1310'> = </span><span class='StoreValue' id='1283-1312'><span class='BinaryOp' id='1313-1314'><span class='matched' id='1315-1316'>(</span><span class='Load nav-anchor' id='load-1318'><span class='matched' id='1319-1320'><b class='variable matched' id='1204-1322'>conv_y$1</b>[</span><span class='Ramp' id='1319-1323'><span class='matched' id='1324-1325'><span class='Symbol matched' id='1326-1327'>ramp</span>(</span><span class='IntImm Imm' id='1324-1329'>0</span><span class='matched' id='1324-1331'>, </span><span class='IntImm Imm' id='1324-1333'>1</span><span class='matched' id='1324-1335'>, </span><span class='IntImm Imm' id='1324-1337'>4</span><span class='matched' id='1324-1339'>)</span></span><span class='matched' id='1319-1341'>]</span></span> <span class='matched Operator' id='1315-1343'>+</span> <span class='BinaryOp' id='1315-1345'><span class='matched' id='1346-1347'>(</span><span class='Load nav-anchor' id='load-1349'><span class='matched' 
id='1350-1351'><b class='variable matched' id='1172-1353'>conv_x</b>[</span><span class='Ramp' id='1350-1354'><span class='matched' id='1355-1356'><span class='Symbol matched' id='1357-1358'>ramp</span>(</span><span class='BinaryOp' id='1355-1360'><span class='matched' id='1361-1362'>(</span><b class='variable matched' id='1259-1364'>conv_y$1.s1.k$x</b> <span class='matched Operator' id='1361-1365'>*</span> <span class='IntImm Imm' id='1361-1367'>4</span><span class='matched' id='1361-1369'>)</span></span><span class='matched' id='1355-1371'>, </span><span class='IntImm Imm' id='1355-1373'>1</span><span class='matched' id='1355-1375'>, </span><span class='IntImm Imm' id='1355-1377'>4</span><span class='matched' id='1355-1379'>)</span></span><span class='matched' id='1350-1381'>]</span></span> <span class='matched Operator' id='1346-1383'>*</span> <span class='Broadcast' id='1346-1385'><span class='matched' id='1386-1387'>x4(</span><span class='Load nav-anchor' id='load-1389'><span class='matched' id='1390-1391'><b class='variable matched' id='142-1393'>kernel</b>[</span><b class='variable matched' id='1259-1394'>conv_y$1.s1.k$x</b><span class='matched' id='1390-1395'>]</span></span><span class='matched' id='1386-1397'>)</span></span><span class='matched' id='1346-1399'>)</span></span><span class='matched' id='1315-1401'>)</span></span></span></div></div><span class='matched ClosingBrace cb-1259' id='1261-1403'>}</span></div><div class='Free WrapLine' id='1217-1405'><div class='node-cost' id='1406-1408'><div id='cc-1407' class='cost-btn CostColor0'   aria-describedby='tooltip-cc-1407'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-cc-1407' class='tooltip cond-tooltop' role='tooltip-cc-1407'>Op Count: 0</span><div id='dc-1407' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-1407'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-dc-1407' class='tooltip 
cond-tooltop' role='tooltip-dc-1407'>Bits Moved: 0</span></div><span class='keyword' id='1406-1410'>free </span><b class='variable matched' id='1172-1412'>conv_x</b></div></div></div><span class='matched ClosingBrace cb-1204' id='1206-1413'>}</span></div><div class='Consumer' id='1203-1416'><div class='node-cost' id='1417-1418'><div id='cc-1415' class='cost-btn CostColor0'   aria-describedby='tooltip-cc-1415'   line-cost='0' block-cost='4'   line-cost-color='0' block-cost-color='4'></div><span id='tooltip-cc-1415' class='tooltip cond-tooltop' role='tooltip-cc-1415'>Op Count: 0</span><div id='dc-1415' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-1415'   line-cost='0' block-cost='256'   line-cost-color='0' block-cost-color='12'></div><span id='tooltip-dc-1415' class='tooltip cond-tooltop' role='tooltip-dc-1415'>Bits Moved: 0</span></div><a onclick='return toggle(1415);' href=_blank><div class='show-hide-btn-wrapper'>  <div class='show-hide-btn' style='display:none;' id=1415-show>    <i class='bi bi-plus-square' title='Expand code block'></i>  </div>  <div class='show-hide-btn' id=1415-hide>    <i class='bi bi-dash-square' title='Collapse code block'></i>  </div></div><span class='matched' id='1417-1420'><span class='keyword nav-anchor' id='prodcons-1415'>consume </span><b class='variable matched' id='1415-1423'>conv_y$1</b></span></a><button class='icon-btn sync-btn' onclick='scrollToViz("prodcons-viz-1415")'>  <i class='bi bi-arrow-right-square' title='Jump to visualization'></i></button><span class='matched' id='1417-1424'>{</span><div class='indent ProducerConsumerBody' id='1415'><div class='Store WrapLine' id='1426-1428'><div class='node-cost' id='1429-1430'><div id='cc-1427' class='cost-btn CostColor4'   aria-describedby='tooltip-cc-1427'   line-cost='4' block-cost='4'   line-cost-color='4' block-cost-color='4'></div><span id='tooltip-cc-1427' class='tooltip cond-tooltop' role='tooltip-cc-1427'>Op Count: 4</span><div id='dc-1427' class='cost-btn 
CostColor12'   aria-describedby='tooltip-dc-1427'   line-cost='256' block-cost='256'   line-cost-color='12' block-cost-color='12'></div><span id='tooltip-dc-1427' class='tooltip cond-tooltop' role='tooltip-dc-1427'>Bits Moved: 256</span></div><span class='matched' id='1429-1432'><span class='nav-anchor' id='store-1427'><b class='variable matched' id='108-1435'>conv_y</b>[</span></span><span class='Ramp' id='1429-1436'><span class='matched' id='1437-1438'><span class='Symbol matched' id='1439-1440'>ramp</span>(</span><span class='BinaryOp' id='1437-1442'><span class='matched' id='1443-1444'>(</span><span class='BinaryOp' id='1443-1446'><span class='matched' id='1447-1448'>(</span><b class='variable matched' id='706-1450'>conv_y.s0.x.xi.xi</b> <span class='matched Operator' id='1447-1451'>*</span> <span class='IntImm Imm' id='1447-1453'>4</span><span class='matched' id='1447-1455'>)</span></span> <span class='matched Operator' id='1443-1457'>+</span> <b class='variable matched' id='637-1459'>t146</b><span class='matched' id='1443-1460'>)</span></span><span class='matched' id='1437-1462'>, </span><span class='IntImm Imm' id='1437-1464'>1</span><span class='matched' id='1437-1466'>, </span><span class='IntImm Imm' id='1437-1468'>4</span><span class='matched' id='1437-1470'>)</span></span><span class='matched' id='1429-1472'>]</span><span class='Operator Assign Matched' id='1429-1474'> = </span><span class='StoreValue' id='1429-1476'><span class='Load nav-anchor' id='load-1478'><span class='matched' id='1479-1480'><b class='variable matched' id='1415-1482'>conv_y$1</b>[</span><span class='Ramp' id='1479-1483'><span class='matched' id='1484-1485'><span class='Symbol matched' id='1486-1487'>ramp</span>(</span><span class='IntImm Imm' id='1484-1489'>0</span><span class='matched' id='1484-1491'>, </span><span class='IntImm Imm' id='1484-1493'>1</span><span class='matched' id='1484-1495'>, </span><span class='IntImm Imm' id='1484-1497'>4</span><span class='matched' 
id='1484-1499'>)</span></span><span class='matched' id='1479-1501'>]</span></span></span></div></div><span class='matched ClosingBrace cb-1415' id='1417-1503'>}</span></div><div class='Free WrapLine' id='1203-1505'><div class='node-cost' id='1506-1508'><div id='cc-1507' class='cost-btn CostColor0'   aria-describedby='tooltip-cc-1507'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-cc-1507' class='tooltip cond-tooltop' role='tooltip-cc-1507'>Op Count: 0</span><div id='dc-1507' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-1507'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-dc-1507' class='tooltip cond-tooltop' role='tooltip-dc-1507'>Bits Moved: 0</span></div><span class='keyword' id='1506-1510'>free </span><b class='variable matched' id='1185-1512'>conv_y$1</b></div></div></div></div></div><span class='matched ClosingBrace cb-1172' id='1174-1513'>}</span></div></div></div></div></div><span class='matched ClosingBrace cb-706' id='708-1515'>}</span></div></div></div></div><span class='matched ClosingBrace cb-614' id='616-1517'>}</span></div></div></div></div></div></div></div></div></div></div></div></div></div></div></div></div></div><span class='matched ClosingBrace cb-10' id='12-1519'>}</span></div><div class='Function' id='9-1522'><a onclick='return toggle(1521);' href=_blank><div class='show-hide-btn-wrapper'>  <div class='show-hide-btn' style='display:none;' id=1521-show>    <i class='bi bi-plus-square' title='Expand code block'></i>  </div>  <div class='show-hide-btn' id=1521-hide>    <i class='bi bi-dash-square' title='Collapse code block'></i>  </div></div><span class='matched' id='1523-1524'><span class='keyword nav-anchor' id='lowered-func-conv_y'>func </span>conv_y(</span><b class='variable matched' id='1521-1527'>conv_y</b><span class='matched' id='1523-1528'>)</span></a><button class='icon-btn sync-btn' 
onclick='scrollToViz("lowered-func-viz-1521")'>  <i class='bi bi-arrow-right-square' title='Jump to visualization'></i></button><span class='matched' id='1523-1530'>{</span><div class='indent FunctionBody' id='1521'><div class='Block' id='1532-1533'><div class='AssertStmt WrapLine' id='1534-1535'><div class='node-cost' id='1536-1538'><div id='cc-1537' class='cost-btn CostColor2'   aria-describedby='tooltip-cc-1537'   line-cost='2' block-cost='2'   line-cost-color='2' block-cost-color='2'></div><span id='tooltip-cc-1537' class='tooltip cond-tooltop' role='tooltip-cc-1537'>Op Count: 2</span><div id='dc-1537' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-1537'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-dc-1537' class='tooltip cond-tooltop' role='tooltip-dc-1537'>Bits Moved: 0</span></div><span class='matched' id='1536-1540'><span class='Symbol matched' id='1541-1542'>assert</span>(</span><span class='BinaryOp' id='1536-1544'><span class='matched' id='1545-1546'>(</span><span class='Reinterpret' id='1545-1548'><span class='matched' id='1549-1550'><span class='Type' id='1551-1552'>uint64</span>(</span><b class='variable matched' id='1554-1555'>conv_y.buffer</b><span class='matched' id='1549-1556'>)</span></span> <span class='matched Operator' id='1545-1558'>!=</span> <span class='UIntImm Imm' id='1545-1560'>(uint64)0</span><span class='matched' id='1545-1562'>)</span></span><span class='matched' id='1536-1564'>, </span><span class='Call' id='1536-1567'><span class='nav-anchor' id='fn-call-1566'><span class='matched' id='1569-1570'><span class='Symbol matched' id='1571-1572'>halide_error_buffer_argument_is_null</span>(</span><span class='StringImm Imm' id='1569-1574'>"conv_y"</span><span class='matched' id='1569-1576'>)</span></span></span><span class='matched' id='1536-1578'>)</span></div><div class='LetStmt' id='1534-1582'><div class='node-cost' id='1583-1584'><div id='cc-1580' class='cost-btn 
CostColor1'   aria-describedby='tooltip-cc-1580'   line-cost='1' block-cost='125'   line-cost-color='1' block-cost-color='19'></div><span id='tooltip-cc-1580' class='tooltip cond-tooltop' role='tooltip-cc-1580'>Op Count: 1</span><div id='dc-1580' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-1580'   line-cost='0' block-cost='32'   line-cost-color='0' block-cost-color='1'></div><span id='tooltip-dc-1580' class='tooltip cond-tooltop' role='tooltip-dc-1580'>Bits Moved: 0</span></div><p class='WrapLine' id='1583-1586'><span class='cost-highlight' id='cost-bg-1580'><span class='matched' id='1588-1589'><span class='keyword' id='1590-1591'>let </span><b class='variable matched' id='1581-1593'>conv_y</b><span class='Operator Assign' id='1590-1594'> = </span></span><span class='Call' id='1588-1597'><span class='nav-anchor' id='fn-call-1596'><span class='matched' id='1599-1600'><span class='Symbol matched' id='1601-1602'>_halide_buffer_get_host</span>(</span><b class='variable matched' id='1554-1604'>conv_y.buffer</b><span class='matched' id='1599-1605'>)</span></span></span></span></p><div class='LetStmt' id='1583-1609'><div class='node-cost' id='1610-1611'><div id='cc-1607' class='cost-btn CostColor1'   aria-describedby='tooltip-cc-1607'   line-cost='1' block-cost='124'   line-cost-color='1' block-cost-color='19'></div><span id='tooltip-cc-1607' class='tooltip cond-tooltop' role='tooltip-cc-1607'>Op Count: 1</span><div id='dc-1607' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-1607'   line-cost='0' block-cost='32'   line-cost-color='0' block-cost-color='1'></div><span id='tooltip-dc-1607' class='tooltip cond-tooltop' role='tooltip-dc-1607'>Bits Moved: 0</span></div><p class='WrapLine' id='1610-1613'><span class='cost-highlight' id='cost-bg-1607'><span class='matched' id='1615-1616'><span class='keyword' id='1617-1618'>let </span><b class='variable matched' id='1608-1620'>conv_y.type</b><span class='Operator Assign' id='1617-1621'> = 
</span></span><span class='Call' id='1615-1624'><span class='nav-anchor' id='fn-call-1623'><span class='matched' id='1626-1627'><span class='Symbol matched' id='1628-1629'>_halide_buffer_get_type</span>(</span><b class='variable matched' id='1554-1631'>conv_y.buffer</b><span class='matched' id='1626-1632'>)</span></span></span></span></p><div class='LetStmt' id='1610-1636'><div class='node-cost' id='1637-1638'><div id='cc-1634' class='cost-btn CostColor1'   aria-describedby='tooltip-cc-1634'   line-cost='1' block-cost='123'   line-cost-color='1' block-cost-color='19'></div><span id='tooltip-cc-1634' class='tooltip cond-tooltop' role='tooltip-cc-1634'>Op Count: 1</span><div id='dc-1634' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-1634'   line-cost='0' block-cost='32'   line-cost-color='0' block-cost-color='1'></div><span id='tooltip-dc-1634' class='tooltip cond-tooltop' role='tooltip-dc-1634'>Bits Moved: 0</span></div><p class='WrapLine' id='1637-1640'><span class='cost-highlight' id='cost-bg-1634'><span class='matched' id='1642-1643'><span class='keyword' id='1644-1645'>let </span><b class='variable matched' id='1635-1647'>conv_y.device_dirty</b><span class='Operator Assign' id='1644-1648'> = </span></span><span class='Call' id='1642-1651'><span class='nav-anchor' id='fn-call-1650'><span class='matched' id='1653-1654'><span class='Symbol matched' id='1655-1656'>_halide_buffer_get_device_dirty</span>(</span><b class='variable matched' id='1554-1658'>conv_y.buffer</b><span class='matched' id='1653-1659'>)</span></span></span></span></p><div class='LetStmt' id='1637-1663'><div class='node-cost' id='1664-1665'><div id='cc-1661' class='cost-btn CostColor1'   aria-describedby='tooltip-cc-1661'   line-cost='1' block-cost='122'   line-cost-color='1' block-cost-color='19'></div><span id='tooltip-cc-1661' class='tooltip cond-tooltop' role='tooltip-cc-1661'>Op Count: 1</span><div id='dc-1661' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-1661'   
line-cost='0' block-cost='32'   line-cost-color='0' block-cost-color='1'></div><span id='tooltip-dc-1661' class='tooltip cond-tooltop' role='tooltip-dc-1661'>Bits Moved: 0</span></div><p class='WrapLine' id='1664-1667'><span class='cost-highlight' id='cost-bg-1661'><span class='matched' id='1669-1670'><span class='keyword' id='1671-1672'>let </span><b class='variable matched' id='1662-1674'>conv_y.dimensions</b><span class='Operator Assign' id='1671-1675'> = </span></span><span class='Call' id='1669-1678'><span class='nav-anchor' id='fn-call-1677'><span class='matched' id='1680-1681'><span class='Symbol matched' id='1682-1683'>_halide_buffer_get_dimensions</span>(</span><b class='variable matched' id='1554-1685'>conv_y.buffer</b><span class='matched' id='1680-1686'>)</span></span></span></span></p><div class='LetStmt' id='1664-1690'><div class='node-cost' id='1691-1692'><div id='cc-1688' class='cost-btn CostColor1'   aria-describedby='tooltip-cc-1688'   line-cost='1' block-cost='121'   line-cost-color='1' block-cost-color='19'></div><span id='tooltip-cc-1688' class='tooltip cond-tooltop' role='tooltip-cc-1688'>Op Count: 1</span><div id='dc-1688' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-1688'   line-cost='0' block-cost='32'   line-cost-color='0' block-cost-color='1'></div><span id='tooltip-dc-1688' class='tooltip cond-tooltop' role='tooltip-dc-1688'>Bits Moved: 0</span></div><p class='WrapLine' id='1691-1694'><span class='cost-highlight' id='cost-bg-1688'><span class='matched' id='1696-1697'><span class='keyword' id='1698-1699'>let </span><b class='variable matched' id='1689-1701'>conv_y.min.0</b><span class='Operator Assign' id='1698-1702'> = </span></span><span class='Call' id='1696-1705'><span class='nav-anchor' id='fn-call-1704'><span class='matched' id='1707-1708'><span class='Symbol matched' id='1709-1710'>_halide_buffer_get_min</span>(</span><b class='variable matched' id='1554-1712'>conv_y.buffer</b><span class='matched' id='1707-1713'>, 
</span><span class='IntImm Imm' id='1707-1715'>0</span><span class='matched' id='1707-1717'>)</span></span></span></span></p><div class='LetStmt' id='1691-1721'><div class='node-cost' id='1722-1723'><div id='cc-1719' class='cost-btn CostColor1'   aria-describedby='tooltip-cc-1719'   line-cost='1' block-cost='120'   line-cost-color='1' block-cost-color='19'></div><span id='tooltip-cc-1719' class='tooltip cond-tooltop' role='tooltip-cc-1719'>Op Count: 1</span><div id='dc-1719' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-1719'   line-cost='0' block-cost='32'   line-cost-color='0' block-cost-color='1'></div><span id='tooltip-dc-1719' class='tooltip cond-tooltop' role='tooltip-dc-1719'>Bits Moved: 0</span></div><p class='WrapLine' id='1722-1725'><span class='cost-highlight' id='cost-bg-1719'><span class='matched' id='1727-1728'><span class='keyword' id='1729-1730'>let </span><b class='variable matched' id='1720-1732'>conv_y.extent.0</b><span class='Operator Assign' id='1729-1733'> = </span></span><span class='Call' id='1727-1736'><span class='nav-anchor' id='fn-call-1735'><span class='matched' id='1738-1739'><span class='Symbol matched' id='1740-1741'>_halide_buffer_get_extent</span>(</span><b class='variable matched' id='1554-1743'>conv_y.buffer</b><span class='matched' id='1738-1744'>, </span><span class='IntImm Imm' id='1738-1746'>0</span><span class='matched' id='1738-1748'>)</span></span></span></span></p><div class='LetStmt' id='1722-1752'><div class='node-cost' id='1753-1754'><div id='cc-1750' class='cost-btn CostColor1'   aria-describedby='tooltip-cc-1750'   line-cost='1' block-cost='119'   line-cost-color='1' block-cost-color='19'></div><span id='tooltip-cc-1750' class='tooltip cond-tooltop' role='tooltip-cc-1750'>Op Count: 1</span><div id='dc-1750' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-1750'   line-cost='0' block-cost='32'   line-cost-color='0' block-cost-color='1'></div><span id='tooltip-dc-1750' class='tooltip 
cond-tooltop' role='tooltip-dc-1750'>Bits Moved: 0</span></div><p class='WrapLine' id='1753-1756'><span class='cost-highlight' id='cost-bg-1750'><span class='matched' id='1758-1759'><span class='keyword' id='1760-1761'>let </span><b class='variable matched' id='1751-1763'>conv_y.stride.0</b><span class='Operator Assign' id='1760-1764'> = </span></span><span class='Call' id='1758-1767'><span class='nav-anchor' id='fn-call-1766'><span class='matched' id='1769-1770'><span class='Symbol matched' id='1771-1772'>_halide_buffer_get_stride</span>(</span><b class='variable matched' id='1554-1774'>conv_y.buffer</b><span class='matched' id='1769-1775'>, </span><span class='IntImm Imm' id='1769-1777'>0</span><span class='matched' id='1769-1779'>)</span></span></span></span></p><div class='LetStmt' id='1753-1783'><div class='node-cost' id='1784-1785'><div id='cc-1781' class='cost-btn CostColor1'   aria-describedby='tooltip-cc-1781'   line-cost='1' block-cost='118'   line-cost-color='1' block-cost-color='19'></div><span id='tooltip-cc-1781' class='tooltip cond-tooltop' role='tooltip-cc-1781'>Op Count: 1</span><div id='dc-1781' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-1781'   line-cost='0' block-cost='32'   line-cost-color='0' block-cost-color='1'></div><span id='tooltip-dc-1781' class='tooltip cond-tooltop' role='tooltip-dc-1781'>Bits Moved: 0</span></div><p class='WrapLine' id='1784-1787'><span class='cost-highlight' id='cost-bg-1781'><span class='matched' id='1789-1790'><span class='keyword' id='1791-1792'>let </span><b class='variable matched' id='1782-1794'>conv_y.min.1</b><span class='Operator Assign' id='1791-1795'> = </span></span><span class='Call' id='1789-1798'><span class='nav-anchor' id='fn-call-1797'><span class='matched' id='1800-1801'><span class='Symbol matched' id='1802-1803'>_halide_buffer_get_min</span>(</span><b class='variable matched' id='1554-1805'>conv_y.buffer</b><span class='matched' id='1800-1806'>, </span><span class='IntImm Imm' 
id='1800-1808'>1</span><span class='matched' id='1800-1810'>)</span></span></span></span></p><div class='LetStmt' id='1784-1814'><div class='node-cost' id='1815-1816'><div id='cc-1812' class='cost-btn CostColor1'   aria-describedby='tooltip-cc-1812'   line-cost='1' block-cost='117'   line-cost-color='1' block-cost-color='19'></div><span id='tooltip-cc-1812' class='tooltip cond-tooltop' role='tooltip-cc-1812'>Op Count: 1</span><div id='dc-1812' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-1812'   line-cost='0' block-cost='32'   line-cost-color='0' block-cost-color='1'></div><span id='tooltip-dc-1812' class='tooltip cond-tooltop' role='tooltip-dc-1812'>Bits Moved: 0</span></div><p class='WrapLine' id='1815-1818'><span class='cost-highlight' id='cost-bg-1812'><span class='matched' id='1820-1821'><span class='keyword' id='1822-1823'>let </span><b class='variable matched' id='1813-1825'>conv_y.extent.1</b><span class='Operator Assign' id='1822-1826'> = </span></span><span class='Call' id='1820-1829'><span class='nav-anchor' id='fn-call-1828'><span class='matched' id='1831-1832'><span class='Symbol matched' id='1833-1834'>_halide_buffer_get_extent</span>(</span><b class='variable matched' id='1554-1836'>conv_y.buffer</b><span class='matched' id='1831-1837'>, </span><span class='IntImm Imm' id='1831-1839'>1</span><span class='matched' id='1831-1841'>)</span></span></span></span></p><div class='LetStmt' id='1815-1845'><div class='node-cost' id='1846-1847'><div id='cc-1843' class='cost-btn CostColor1'   aria-describedby='tooltip-cc-1843'   line-cost='1' block-cost='116'   line-cost-color='1' block-cost-color='19'></div><span id='tooltip-cc-1843' class='tooltip cond-tooltop' role='tooltip-cc-1843'>Op Count: 1</span><div id='dc-1843' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-1843'   line-cost='0' block-cost='32'   line-cost-color='0' block-cost-color='1'></div><span id='tooltip-dc-1843' class='tooltip cond-tooltop' role='tooltip-dc-1843'>Bits 
Moved: 0</span></div><p class='WrapLine' id='1846-1849'><span class='cost-highlight' id='cost-bg-1843'><span class='matched' id='1851-1852'><span class='keyword' id='1853-1854'>let </span><b class='variable matched' id='1844-1856'>conv_y.stride.1</b><span class='Operator Assign' id='1853-1857'> = </span></span><span class='Call' id='1851-1860'><span class='nav-anchor' id='fn-call-1859'><span class='matched' id='1862-1863'><span class='Symbol matched' id='1864-1865'>_halide_buffer_get_stride</span>(</span><b class='variable matched' id='1554-1867'>conv_y.buffer</b><span class='matched' id='1862-1868'>, </span><span class='IntImm Imm' id='1862-1870'>1</span><span class='matched' id='1862-1872'>)</span></span></span></span></p><div class='LetStmt' id='1846-1876'><div class='node-cost' id='1877-1878'><div id='cc-1874' class='cost-btn CostColor8'   aria-describedby='tooltip-cc-1874'   line-cost='8' block-cost='115'   line-cost-color='8' block-cost-color='19'></div><span id='tooltip-cc-1874' class='tooltip cond-tooltop' role='tooltip-cc-1874'>Op Count: 8</span><div id='dc-1874' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-1874'   line-cost='0' block-cost='32'   line-cost-color='0' block-cost-color='1'></div><span id='tooltip-dc-1874' class='tooltip cond-tooltop' role='tooltip-dc-1874'>Bits Moved: 0</span></div><p class='WrapLine' id='1877-1880'><span class='cost-highlight' id='cost-bg-1874'><span class='matched' id='1882-1883'><span class='keyword' id='1884-1885'>let </span><b class='variable matched' id='1875-1887'>conv_y.extent.0.required.s</b><span class='Operator Assign' id='1884-1888'> = </span></span><span class='Min' id='1882-1890'><span class='matched' id='1891-1892'><span class='Symbol matched' id='1893-1894'>min</span>(</span><span class='BinaryOp' id='1891-1896'><span class='matched' id='1897-1898'>(</span><span class='Max' id='1897-1900'><span class='matched' id='1901-1902'><span class='Symbol matched' id='1903-1904'>max</span>(</span><span 
class='BinaryOp' id='1901-1906'><span class='matched' id='1907-1908'>(</span><span class='Max' id='1907-1910'><span class='matched' id='1911-1912'><span class='Symbol matched' id='1913-1914'>max</span>(</span><span class='BinaryOp' id='1911-1916'><span class='matched' id='1917-1918'>(</span><span class='IntImm Imm' id='1917-1920'>-64</span> <span class='matched Operator' id='1917-1922'>-</span> <b class='variable matched' id='1720-1924'>conv_y.extent.0</b><span class='matched' id='1917-1925'>)</span></span><span class='matched' id='1911-1927'>, </span><span class='BinaryOp' id='1911-1929'><span class='matched' id='1930-1931'>(</span><b class='variable matched' id='1720-1933'>conv_y.extent.0</b> <span class='matched Operator' id='1930-1934'>+</span> <span class='IntImm Imm' id='1930-1936'>-1</span><span class='matched' id='1930-1938'>)</span></span><span class='matched' id='1911-1940'>)</span></span> <span class='matched Operator' id='1907-1942'>/</span> <span class='IntImm Imm' id='1907-1944'>64</span><span class='matched' id='1907-1946'>)</span></span><span class='matched' id='1901-1948'>, </span><span class='IntImm Imm' id='1901-1950'>0</span><span class='matched' id='1901-1952'>)</span></span> <span class='matched Operator' id='1897-1954'>*</span> <span class='IntImm Imm' id='1897-1956'>64</span><span class='matched' id='1897-1958'>)</span></span><span class='matched' id='1891-1960'>, </span><span class='BinaryOp' id='1891-1962'><span class='matched' id='1963-1964'>(</span><b class='variable matched' id='1720-1966'>conv_y.extent.0</b> <span class='matched Operator' id='1963-1967'>+</span> <span class='IntImm Imm' id='1963-1969'>-64</span><span class='matched' id='1963-1971'>)</span></span><span class='matched' id='1891-1973'>)</span></span></span></p><div class='LetStmt' id='1877-1977'><div class='node-cost' id='1978-1979'><div id='cc-1975' class='cost-btn CostColor19'   aria-describedby='tooltip-cc-1975'   line-cost='19' block-cost='107'   line-cost-color='19' 
block-cost-color='19'></div><span id='tooltip-cc-1975' class='tooltip cond-tooltop' role='tooltip-cc-1975'>Op Count: 19</span><div id='dc-1975' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-1975'   line-cost='0' block-cost='32'   line-cost-color='0' block-cost-color='1'></div><span id='tooltip-dc-1975' class='tooltip cond-tooltop' role='tooltip-dc-1975'>Bits Moved: 0</span></div><p class='WrapLine' id='1978-1981'><span class='cost-highlight' id='cost-bg-1975'><span class='matched' id='1983-1984'><span class='keyword' id='1985-1986'>let </span><b class='variable matched' id='1976-1988'>conv_y.extent.1.required.s</b><span class='Operator Assign' id='1985-1989'> = </span></span><span class='Let' id='1983-1992'><span class='matched' id='1993-1994'>(<span class='keyword' id='1995-1996'>let </span><b class='variable matched' id='1991-1998'>t149</b><span class='Operator Assign' id='1995-1999'> = </span></span><span class='BinaryOp' id='1993-2001'><span class='matched' id='2002-2003'>(</span><span class='IntImm Imm' id='2002-2005'>0</span> <span class='matched Operator' id='2002-2007'>&lt;</span> <b class='variable matched' id='1720-2009'>conv_y.extent.0</b><span class='matched' id='2002-2010'>)</span></span><span class='matched keyword' id='1993-2012'> in </span><span class='Let' id='1993-2015'><span class='matched' id='2016-2017'>(<span class='keyword' id='2018-2019'>let </span><b class='variable matched' id='2014-2021'>t150</b><span class='Operator Assign' id='2018-2022'> = </span></span><span class='BinaryOp' id='2016-2024'><span class='matched' id='2025-2026'>(</span><span class='BinaryOp' id='2025-2028'><span class='matched' id='2029-2030'>(</span><span class='BinaryOp' id='2029-2032'><span class='matched' id='2033-2034'>(</span><span class='BinaryOp' id='2033-2036'><span class='matched' id='2037-2038'>(</span><span class='BinaryOp' id='2037-2040'><span class='matched' id='2041-2042'>(</span><b class='variable matched' id='1720-2044'>conv_y.extent.0</b> 
<span class='matched Operator' id='2041-2045'>+</span> <span class='IntImm Imm' id='2041-2047'>63</span><span class='matched' id='2041-2049'>)</span></span> <span class='matched Operator' id='2037-2051'>/</span> <span class='IntImm Imm' id='2037-2053'>64</span><span class='matched' id='2037-2055'>)</span></span> <span class='matched Operator' id='2033-2057'>*</span> <span class='BinaryOp' id='2033-2059'><span class='matched' id='2060-2061'>(</span><span class='BinaryOp' id='2060-2063'><span class='matched' id='2064-2065'>(</span><b class='variable matched' id='1813-2067'>conv_y.extent.1</b> <span class='matched Operator' id='2064-2068'>+</span> <span class='IntImm Imm' id='2064-2070'>63</span><span class='matched' id='2064-2072'>)</span></span> <span class='matched Operator' id='2060-2074'>/</span> <span class='IntImm Imm' id='2060-2076'>64</span><span class='matched' id='2060-2078'>)</span></span><span class='matched' id='2033-2080'>)</span></span> <span class='matched Operator' id='2029-2082'>+</span> <span class='IntImm Imm' id='2029-2084'>-1</span><span class='matched' id='2029-2086'>)</span></span> <span class='matched Operator' id='2025-2088'>/</span> <span class='BinaryOp' id='2025-2090'><span class='matched' id='2091-2092'>(</span><span class='BinaryOp' id='2091-2094'><span class='matched' id='2095-2096'>(</span><b class='variable matched' id='1720-2098'>conv_y.extent.0</b> <span class='matched Operator' id='2095-2099'>+</span> <span class='IntImm Imm' id='2095-2101'>63</span><span class='matched' id='2095-2103'>)</span></span> <span class='matched Operator' id='2091-2105'>/</span> <span class='IntImm Imm' id='2091-2107'>64</span><span class='matched' id='2091-2109'>)</span></span><span class='matched' id='2025-2111'>)</span></span><span class='matched keyword' id='2016-2113'> in </span><span class='BinaryOp' id='2016-2115'><span class='matched' id='2116-2117'>(</span><span class='Min' id='2116-2119'><span class='matched' id='2120-2121'><span class='Symbol 
matched' id='2122-2123'>min</span>(</span><span class='BinaryOp' id='2120-2125'><span class='matched' id='2126-2127'>(</span><span class='Select' id='2126-2129'><span class='matched' id='2130-2131'><span class='Symbol matched' id='2132-2133'>select</span>(</span><b class='variable matched' id='1991-2135'>t149</b><span class='matched' id='2130-2136'>, </span><b class='variable matched' id='2014-2138'>t150</b><span class='matched' id='2130-2139'>, </span><span class='IntImm Imm' id='2130-2141'>0</span><span class='matched' id='2130-2143'>)</span></span> <span class='matched Operator' id='2126-2145'>*</span> <span class='IntImm Imm' id='2126-2147'>64</span><span class='matched' id='2126-2149'>)</span></span><span class='matched' id='2120-2151'>, </span><span class='BinaryOp' id='2120-2153'><span class='matched' id='2154-2155'>(</span><b class='variable matched' id='1813-2157'>conv_y.extent.1</b> <span class='matched Operator' id='2154-2158'>+</span> <span class='IntImm Imm' id='2154-2160'>-64</span><span class='matched' id='2154-2162'>)</span></span><span class='matched' id='2120-2164'>)</span></span> <span class='matched Operator' id='2116-2166'>-</span> <span class='Min' id='2116-2168'><span class='matched' id='2169-2170'><span class='Symbol matched' id='2171-2172'>min</span>(</span><span class='BinaryOp' id='2169-2174'><span class='matched' id='2175-2176'>(</span><span class='Select' id='2175-2178'><span class='matched' id='2179-2180'><span class='Symbol matched' id='2181-2182'>select</span>(</span><b class='variable matched' id='1991-2184'>t149</b><span class='matched' id='2179-2185'>, </span><span class='IntImm Imm' id='2179-2187'>0</span><span class='matched' id='2179-2189'>, </span><b class='variable matched' id='2014-2191'>t150</b><span class='matched' id='2179-2192'>)</span></span> <span class='matched Operator' id='2175-2194'>*</span> <span class='IntImm Imm' id='2175-2196'>64</span><span class='matched' id='2175-2198'>)</span></span><span class='matched' 
id='2169-2200'>, </span><span class='BinaryOp' id='2169-2202'><span class='matched' id='2203-2204'>(</span><b class='variable matched' id='1813-2206'>conv_y.extent.1</b> <span class='matched Operator' id='2203-2207'>+</span> <span class='IntImm Imm' id='2203-2209'>-64</span><span class='matched' id='2203-2211'>)</span></span><span class='matched' id='2169-2213'>)</span></span><span class='matched' id='2116-2215'>)</span></span><span class='matched' id='2016-2217'>)</span></span><span class='matched' id='1993-2219'>)</span></span></span></p><div class='LetStmt' id='1978-2223'><div class='node-cost' id='2224-2225'><div id='cc-2221' class='cost-btn CostColor14'   aria-describedby='tooltip-cc-2221'   line-cost='14' block-cost='88'   line-cost-color='14' block-cost-color='19'></div><span id='tooltip-cc-2221' class='tooltip cond-tooltop' role='tooltip-cc-2221'>Op Count: 14</span><div id='dc-2221' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-2221'   line-cost='0' block-cost='32'   line-cost-color='0' block-cost-color='1'></div><span id='tooltip-dc-2221' class='tooltip cond-tooltop' role='tooltip-dc-2221'>Bits Moved: 0</span></div><p class='WrapLine' id='2224-2227'><span class='cost-highlight' id='cost-bg-2221'><span class='matched' id='2229-2230'><span class='keyword' id='2231-2232'>let </span><b class='variable matched' id='2222-2234'>conv_y.min.1.required.s</b><span class='Operator Assign' id='2231-2235'> = </span></span><span class='Min' id='2229-2237'><span class='matched' id='2238-2239'><span class='Symbol matched' id='2240-2241'>min</span>(</span><span class='BinaryOp' id='2238-2243'><span class='matched' id='2244-2245'>(</span><span class='Select' id='2244-2247'><span class='matched' id='2248-2249'><span class='Symbol matched' id='2250-2251'>select</span>(</span><span class='BinaryOp' id='2248-2253'><span class='matched' id='2254-2255'>(</span><span class='IntImm Imm' id='2254-2257'>0</span> <span class='matched Operator' id='2254-2259'>&lt;</span> <b 
class='variable matched' id='1720-2261'>conv_y.extent.0</b><span class='matched' id='2254-2262'>)</span></span><span class='matched' id='2248-2264'>, </span><span class='IntImm Imm' id='2248-2266'>0</span><span class='matched' id='2248-2268'>, </span><span class='BinaryOp' id='2248-2270'><span class='matched' id='2271-2272'>(</span><span class='BinaryOp' id='2271-2274'><span class='matched' id='2275-2276'>(</span><span class='BinaryOp' id='2275-2278'><span class='matched' id='2279-2280'>(</span><span class='BinaryOp' id='2279-2282'><span class='matched' id='2283-2284'>(</span><span class='BinaryOp' id='2283-2286'><span class='matched' id='2287-2288'>(</span><b class='variable matched' id='1720-2290'>conv_y.extent.0</b> <span class='matched Operator' id='2287-2291'>+</span> <span class='IntImm Imm' id='2287-2293'>63</span><span class='matched' id='2287-2295'>)</span></span> <span class='matched Operator' id='2283-2297'>/</span> <span class='IntImm Imm' id='2283-2299'>64</span><span class='matched' id='2283-2301'>)</span></span> <span class='matched Operator' id='2279-2303'>*</span> <span class='BinaryOp' id='2279-2305'><span class='matched' id='2306-2307'>(</span><span class='BinaryOp' id='2306-2309'><span class='matched' id='2310-2311'>(</span><b class='variable matched' id='1813-2313'>conv_y.extent.1</b> <span class='matched Operator' id='2310-2314'>+</span> <span class='IntImm Imm' id='2310-2316'>63</span><span class='matched' id='2310-2318'>)</span></span> <span class='matched Operator' id='2306-2320'>/</span> <span class='IntImm Imm' id='2306-2322'>64</span><span class='matched' id='2306-2324'>)</span></span><span class='matched' id='2279-2326'>)</span></span> <span class='matched Operator' id='2275-2328'>+</span> <span class='IntImm Imm' id='2275-2330'>-1</span><span class='matched' id='2275-2332'>)</span></span> <span class='matched Operator' id='2271-2334'>/</span> <span class='BinaryOp' id='2271-2336'><span class='matched' id='2337-2338'>(</span><span 
class='BinaryOp' id='2337-2340'><span class='matched' id='2341-2342'>(</span><b class='variable matched' id='1720-2344'>conv_y.extent.0</b> <span class='matched Operator' id='2341-2345'>+</span> <span class='IntImm Imm' id='2341-2347'>63</span><span class='matched' id='2341-2349'>)</span></span> <span class='matched Operator' id='2337-2351'>/</span> <span class='IntImm Imm' id='2337-2353'>64</span><span class='matched' id='2337-2355'>)</span></span><span class='matched' id='2271-2357'>)</span></span><span class='matched' id='2248-2359'>)</span></span> <span class='matched Operator' id='2244-2361'>*</span> <span class='IntImm Imm' id='2244-2363'>64</span><span class='matched' id='2244-2365'>)</span></span><span class='matched' id='2238-2367'>, </span><span class='BinaryOp' id='2238-2369'><span class='matched' id='2370-2371'>(</span><b class='variable matched' id='1813-2373'>conv_y.extent.1</b> <span class='matched Operator' id='2370-2374'>+</span> <span class='IntImm Imm' id='2370-2376'>-64</span><span class='matched' id='2370-2378'>)</span></span><span class='matched' id='2238-2380'>)</span></span></span></p><div class='Block' id='2224-2382'><div class='IfThenElse' id='2383-2386'><div class='node-cost' id='2387-2388'><div id='cc-2384' class='cost-btn CostColor2'   aria-describedby='tooltip-cc-2384'   line-cost='2' block-cost='13'   line-cost-color='2' block-cost-color='13'></div><span id='tooltip-cc-2384' class='tooltip cond-tooltop' role='tooltip-cc-2384'>Op Count: 2</span><div id='dc-2384' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-2384'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-dc-2384' class='tooltip cond-tooltop' role='tooltip-dc-2384'>Bits Moved: 0</span></div><a onclick='return toggle(2384);' href=_blank><div class='show-hide-btn-wrapper'>  <div class='show-hide-btn' style='display:none;' id=2384-show>    <i class='bi bi-plus-square' title='Expand code block'></i>  </div>  <div 
class='show-hide-btn' id=2384-hide>    <i class='bi bi-dash-square' title='Collapse code block'></i>  </div></div><span class='matched' id='2387-2390'><span class='keyword nav-anchor IfSpan' id='cond-2385'>if</span> (</span><span class='Call' id='2387-2394'><span class='nav-anchor' id='fn-call-2393'><span class='matched' id='2396-2397'><span class='Symbol matched' id='2398-2399'>_halide_buffer_is_bounds_query</span>(</span><b class='variable matched' id='1554-2401'>conv_y.buffer</b><span class='matched' id='2396-2402'>)</span></span></span><span class='matched' id='2387-2404'>)</span></a><button class='icon-btn sync-btn' onclick='scrollToViz("cond-viz-2385")'>  <i class='bi bi-arrow-right-square' title='Jump to visualization'></i></button><span class='matched' id='2387-2406'> {</span><div class='indent ThenBody' id='2384'><div class='Block' id='2408-2409'><div class='node-cost' id='2410-2411'><div id='cc-2385' class='cost-btn CostColor11'   aria-describedby='tooltip-cc-2385'   line-cost='11' block-cost='11'   line-cost-color='11' block-cost-color='11'></div><span id='tooltip-cc-2385' class='tooltip cond-tooltop' role='tooltip-cc-2385'>Op Count: 11</span><div id='dc-2385' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-2385'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-dc-2385' class='tooltip cond-tooltop' role='tooltip-dc-2385'>Bits Moved: 0</span></div><span class='Let' id='2410-2414'><span class='matched' id='2415-2416'>(<span class='keyword' id='2417-2418'>let </span><b class='variable matched' id='2413-2420'>t151</b><span class='Operator Assign' id='2417-2421'> = </span></span><span class='Max' id='2415-2423'><span class='matched' id='2424-2425'><span class='Symbol matched' id='2426-2427'>max</span>(</span><b class='variable matched' id='1875-2429'>conv_y.extent.0.required.s</b><span class='matched' id='2424-2430'>, </span><span class='IntImm Imm' id='2424-2432'>0</span><span class='matched' 
id='2424-2434'>)</span></span><span class='matched keyword' id='2415-2436'> in </span><span class='Call' id='2415-2439'><span class='nav-anchor' id='fn-call-2438'><span class='matched' id='2441-2442'><span class='Symbol matched' id='2443-2444'>_halide_buffer_init</span>(</span><b class='variable matched' id='1554-2446'>conv_y.buffer</b><span class='matched' id='2441-2447'>, </span><span class='Call' id='2441-2450'><span class='nav-anchor' id='fn-call-2449'><span class='matched' id='2452-2453'><span class='Symbol matched' id='2454-2455'>_halide_buffer_get_shape</span>(</span><b class='variable matched' id='1554-2457'>conv_y.buffer</b><span class='matched' id='2452-2458'>)</span></span></span><span class='matched' id='2441-2460'>, </span><span class='Reinterpret' id='2441-2462'><span class='matched' id='2463-2464'><span class='Type' id='2465-2466'>(void *)</span>(</span><span class='UIntImm Imm' id='2463-2468'>(uint64)0</span><span class='matched' id='2463-2470'>)</span></span><span class='matched' id='2441-2472'>, </span><span class='UIntImm Imm' id='2441-2474'>(uint64)0</span><span class='matched' id='2441-2476'>, </span><span class='Reinterpret' id='2441-2478'><span class='matched' id='2479-2480'><span class='Type' id='2481-2482'>(struct halide_device_interface_t *)</span>(</span><span class='UIntImm Imm' id='2479-2484'>(uint64)0</span><span class='matched' id='2479-2486'>)</span></span><span class='matched' id='2441-2488'>, </span><span class='IntImm Imm' id='2441-2490'>2</span><span class='matched' id='2441-2492'>, </span><span class='IntImm Imm' id='2441-2494'>32</span><span class='matched' id='2441-2496'>, </span><span class='IntImm Imm' id='2441-2498'>2</span><span class='matched' id='2441-2500'>, </span><span class='Call' id='2441-2503'><span class='nav-anchor' id='fn-call-2502'><span class='matched' id='2505-2506'><span class='Symbol matched' id='2507-2508'>make_struct</span>(</span><span class='BinaryOp' id='2505-2510'><span class='matched' 
id='2511-2512'>(</span><span class='BinaryOp' id='2511-2514'><span class='matched' id='2515-2516'>(</span><span class='Min' id='2515-2518'><span class='matched' id='2519-2520'><span class='Symbol matched' id='2521-2522'>min</span>(</span><b class='variable matched' id='1720-2524'>conv_y.extent.0</b><span class='matched' id='2519-2525'>, </span><span class='IntImm Imm' id='2519-2527'>64</span><span class='matched' id='2519-2529'>)</span></span> <span class='matched Operator' id='2515-2531'>+</span> <b class='variable matched' id='1689-2533'>conv_y.min.0</b><span class='matched' id='2515-2534'>)</span></span> <span class='matched Operator' id='2511-2536'>+</span> <span class='IntImm Imm' id='2511-2538'>-64</span><span class='matched' id='2511-2540'>)</span></span><span class='matched' id='2505-2542'>, </span><span class='BinaryOp' id='2505-2544'><span class='matched' id='2545-2546'>(</span><b class='variable matched' id='2413-2548'>t151</b> <span class='matched Operator' id='2545-2549'>+</span> <span class='IntImm Imm' id='2545-2551'>64</span><span class='matched' id='2545-2553'>)</span></span><span class='matched' id='2505-2555'>, </span><span class='IntImm Imm' id='2505-2557'>1</span><span class='matched' id='2505-2559'>, </span><span class='IntImm Imm' id='2505-2561'>0</span><span class='matched' id='2505-2563'>, </span><span class='BinaryOp' id='2505-2565'><span class='matched' id='2566-2567'>(</span><b class='variable matched' id='1782-2569'>conv_y.min.1</b> <span class='matched Operator' id='2566-2570'>+</span> <b class='variable matched' id='2222-2572'>conv_y.min.1.required.s</b><span class='matched' id='2566-2573'>)</span></span><span class='matched' id='2505-2575'>, </span><span class='BinaryOp' id='2505-2577'><span class='matched' id='2578-2579'>(</span><b class='variable matched' id='1976-2581'>conv_y.extent.1.required.s</b> <span class='matched Operator' id='2578-2582'>+</span> <span class='IntImm Imm' id='2578-2584'>64</span><span class='matched' 
id='2578-2586'>)</span></span><span class='matched' id='2505-2588'>, </span><span class='BinaryOp' id='2505-2590'><span class='matched' id='2591-2592'>(</span><b class='variable matched' id='2413-2594'>t151</b> <span class='matched Operator' id='2591-2595'>+</span> <span class='IntImm Imm' id='2591-2597'>64</span><span class='matched' id='2591-2599'>)</span></span><span class='matched' id='2505-2601'>, </span><span class='IntImm Imm' id='2505-2603'>0</span><span class='matched' id='2505-2605'>)</span></span></span><span class='matched' id='2441-2607'>, </span><span class='UIntImm Imm' id='2441-2609'>(uint64)0</span><span class='matched' id='2441-2611'>)</span></span></span><span class='matched' id='2415-2613'>)</span></span></div></div><span class='matched ClosingBrace cb-2384' id='2387-2615'>}</span></div><div class='IfThenElse' id='2383-2619'><div class='node-cost' id='2620-2621'><div id='cc-2617' class='cost-btn CostColor3'   aria-describedby='tooltip-cc-2617'   line-cost='3' block-cost='61'   line-cost-color='3' block-cost-color='19'></div><span id='tooltip-cc-2617' class='tooltip cond-tooltop' role='tooltip-cc-2617'>Op Count: 3</span><div id='dc-2617' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-2617'   line-cost='0' block-cost='32'   line-cost-color='0' block-cost-color='1'></div><span id='tooltip-dc-2617' class='tooltip cond-tooltop' role='tooltip-dc-2617'>Bits Moved: 0</span></div><a onclick='return toggle(2617);' href=_blank><div class='show-hide-btn-wrapper'>  <div class='show-hide-btn' style='display:none;' id=2617-show>    <i class='bi bi-plus-square' title='Expand code block'></i>  </div>  <div class='show-hide-btn' id=2617-hide>    <i class='bi bi-dash-square' title='Collapse code block'></i>  </div></div><span class='matched' id='2620-2623'><span class='keyword nav-anchor IfSpan' id='cond-2618'>if</span> (</span><span class='Not' id='2620-2626'>!<span class='Call' id='2627-2628'><span class='nav-anchor' id='fn-call-2393'><span 
class='matched' id='2630-2631'><span class='Symbol matched' id='2632-2633'>_halide_buffer_is_bounds_query</span>(</span><b class='variable matched' id='1554-2635'>conv_y.buffer</b><span class='matched' id='2630-2636'>)</span></span></span></span><span class='matched' id='2620-2638'>)</span></a><button class='icon-btn sync-btn' onclick='scrollToViz("cond-viz-2618")'>  <i class='bi bi-arrow-right-square' title='Jump to visualization'></i></button><span class='matched' id='2620-2640'> {</span><div class='indent ThenBody' id='2617'><div class='Block' id='2642-2643'><div class='AssertStmt WrapLine' id='2644-2645'><div class='node-cost' id='2646-2648'><div id='cc-2647' class='cost-btn CostColor2'   aria-describedby='tooltip-cc-2647'   line-cost='2' block-cost='2'   line-cost-color='2' block-cost-color='2'></div><span id='tooltip-cc-2647' class='tooltip cond-tooltop' role='tooltip-cc-2647'>Op Count: 2</span><div id='dc-2647' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-2647'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-dc-2647' class='tooltip cond-tooltop' role='tooltip-dc-2647'>Bits Moved: 0</span></div><span class='matched' id='2646-2650'><span class='Symbol matched' id='2651-2652'>assert</span>(</span><span class='BinaryOp' id='2646-2654'><span class='matched' id='2655-2656'>(</span><b class='variable matched' id='1608-2658'>conv_y.type</b> <span class='matched Operator' id='2655-2659'>==</span> <span class='UIntImm Imm' id='2655-2661'>(uint32)73730</span><span class='matched' id='2655-2663'>)</span></span><span class='matched' id='2646-2665'>, </span><span class='Call' id='2646-2668'><span class='nav-anchor' id='fn-call-2667'><span class='matched' id='2670-2671'><span class='Symbol matched' id='2672-2673'>halide_error_bad_type</span>(</span><span class='StringImm Imm' id='2670-2675'>"Output buffer conv_y"</span><span class='matched' id='2670-2677'>, </span><b class='variable matched' 
id='1608-2679'>conv_y.type</b><span class='matched' id='2670-2680'>, </span><span class='UIntImm Imm' id='2670-2682'>(uint32)73730</span><span class='matched' id='2670-2684'>)</span></span></span><span class='matched' id='2646-2686'>)</span></div><div class='AssertStmt WrapLine' id='2644-2688'><div class='node-cost' id='2689-2691'><div id='cc-2690' class='cost-btn CostColor2'   aria-describedby='tooltip-cc-2690'   line-cost='2' block-cost='2'   line-cost-color='2' block-cost-color='2'></div><span id='tooltip-cc-2690' class='tooltip cond-tooltop' role='tooltip-cc-2690'>Op Count: 2</span><div id='dc-2690' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-2690'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-dc-2690' class='tooltip cond-tooltop' role='tooltip-dc-2690'>Bits Moved: 0</span></div><span class='matched' id='2689-2693'><span class='Symbol matched' id='2694-2695'>assert</span>(</span><span class='BinaryOp' id='2689-2697'><span class='matched' id='2698-2699'>(</span><b class='variable matched' id='1662-2701'>conv_y.dimensions</b> <span class='matched Operator' id='2698-2702'>==</span> <span class='IntImm Imm' id='2698-2704'>2</span><span class='matched' id='2698-2706'>)</span></span><span class='matched' id='2689-2708'>, </span><span class='Call' id='2689-2711'><span class='nav-anchor' id='fn-call-2710'><span class='matched' id='2713-2714'><span class='Symbol matched' id='2715-2716'>halide_error_bad_dimensions</span>(</span><span class='StringImm Imm' id='2713-2718'>"Output buffer conv_y"</span><span class='matched' id='2713-2720'>, </span><b class='variable matched' id='1662-2722'>conv_y.dimensions</b><span class='matched' id='2713-2723'>, </span><span class='IntImm Imm' id='2713-2725'>2</span><span class='matched' id='2713-2727'>)</span></span></span><span class='matched' id='2689-2729'>)</span></div><div class='AssertStmt WrapLine' id='2644-2731'><div class='node-cost' id='2732-2734'><div 
id='cc-2733' class='cost-btn CostColor6'   aria-describedby='tooltip-cc-2733'   line-cost='6' block-cost='6'   line-cost-color='6' block-cost-color='6'></div><span id='tooltip-cc-2733' class='tooltip cond-tooltop' role='tooltip-cc-2733'>Op Count: 6</span><div id='dc-2733' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-2733'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-dc-2733' class='tooltip cond-tooltop' role='tooltip-dc-2733'>Bits Moved: 0</span></div><span class='matched' id='2732-2736'><span class='Symbol matched' id='2737-2738'>assert</span>(</span><span class='BinaryOp' id='2732-2740'><span class='matched' id='2741-2742'>(</span><span class='Max' id='2741-2744'><span class='matched' id='2745-2746'><span class='Symbol matched' id='2747-2748'>max</span>(</span><span class='BinaryOp' id='2745-2750'><span class='matched' id='2751-2752'>(</span><span class='Max' id='2751-2754'><span class='matched' id='2755-2756'><span class='Symbol matched' id='2757-2758'>max</span>(</span><b class='variable matched' id='1875-2760'>conv_y.extent.0.required.s</b><span class='matched' id='2755-2761'>, </span><span class='IntImm Imm' id='2755-2763'>0</span><span class='matched' id='2755-2765'>)</span></span> <span class='matched Operator' id='2751-2767'>+</span> <span class='Min' id='2751-2769'><span class='matched' id='2770-2771'><span class='Symbol matched' id='2772-2773'>min</span>(</span><b class='variable matched' id='1720-2775'>conv_y.extent.0</b><span class='matched' id='2770-2776'>, </span><span class='IntImm Imm' id='2770-2778'>64</span><span class='matched' id='2770-2780'>)</span></span><span class='matched' id='2751-2782'>)</span></span><span class='matched' id='2745-2784'>, </span><span class='IntImm Imm' id='2745-2786'>64</span><span class='matched' id='2745-2788'>)</span></span> <span class='matched Operator' id='2741-2790'>&lt=</span> <b class='variable matched' id='1720-2792'>conv_y.extent.0</b><span 
class='matched' id='2741-2793'>)</span></span><span class='matched' id='2732-2795'>, </span><span class='Let' id='2732-2798'><span class='matched' id='2799-2800'>(<span class='keyword' id='2801-2802'>let </span><b class='variable matched' id='2797-2804'>t152</b><span class='Operator Assign' id='2801-2805'> = </span></span><span class='Min' id='2799-2807'><span class='matched' id='2808-2809'><span class='Symbol matched' id='2810-2811'>min</span>(</span><b class='variable matched' id='1720-2813'>conv_y.extent.0</b><span class='matched' id='2808-2814'>, </span><span class='IntImm Imm' id='2808-2816'>64</span><span class='matched' id='2808-2818'>)</span></span><span class='matched keyword' id='2799-2820'> in </span><span class='Call' id='2799-2823'><span class='nav-anchor' id='fn-call-2822'><span class='matched' id='2825-2826'><span class='Symbol matched' id='2827-2828'>halide_error_access_out_of_bounds</span>(</span><span class='StringImm Imm' id='2825-2830'>"Output buffer conv_y"</span><span class='matched' id='2825-2832'>, </span><span class='IntImm Imm' id='2825-2834'>0</span><span class='matched' id='2825-2836'>, </span><span class='BinaryOp' id='2825-2838'><span class='matched' id='2839-2840'>(</span><span class='BinaryOp' id='2839-2842'><span class='matched' id='2843-2844'>(</span><b class='variable matched' id='2797-2846'>t152</b> <span class='matched Operator' id='2843-2847'>+</span> <b class='variable matched' id='1689-2849'>conv_y.min.0</b><span class='matched' id='2843-2850'>)</span></span> <span class='matched Operator' id='2839-2852'>+</span> <span class='IntImm Imm' id='2839-2854'>-64</span><span class='matched' id='2839-2856'>)</span></span><span class='matched' id='2825-2858'>, </span><span class='BinaryOp' id='2825-2860'><span class='matched' id='2861-2862'>(</span><span class='BinaryOp' id='2861-2864'><span class='matched' id='2865-2866'>(</span><span class='BinaryOp' id='2865-2868'><span class='matched' id='2869-2870'>(</span><span class='Max' 
id='2869-2872'><span class='matched' id='2873-2874'><span class='Symbol matched' id='2875-2876'>max</span>(</span><b class='variable matched' id='1875-2878'>conv_y.extent.0.required.s</b><span class='matched' id='2873-2879'>, </span><span class='IntImm Imm' id='2873-2881'>0</span><span class='matched' id='2873-2883'>)</span></span> <span class='matched Operator' id='2869-2885'>+</span> <b class='variable matched' id='2797-2887'>t152</b><span class='matched' id='2869-2888'>)</span></span> <span class='matched Operator' id='2865-2890'>+</span> <b class='variable matched' id='1689-2892'>conv_y.min.0</b><span class='matched' id='2865-2893'>)</span></span> <span class='matched Operator' id='2861-2895'>+</span> <span class='IntImm Imm' id='2861-2897'>-1</span><span class='matched' id='2861-2899'>)</span></span><span class='matched' id='2825-2901'>, </span><b class='variable matched' id='1689-2903'>conv_y.min.0</b><span class='matched' id='2825-2904'>, </span><span class='BinaryOp' id='2825-2906'><span class='matched' id='2907-2908'>(</span><span class='BinaryOp' id='2907-2910'><span class='matched' id='2911-2912'>(</span><b class='variable matched' id='1720-2914'>conv_y.extent.0</b> <span class='matched Operator' id='2911-2915'>+</span> <b class='variable matched' id='1689-2917'>conv_y.min.0</b><span class='matched' id='2911-2918'>)</span></span> <span class='matched Operator' id='2907-2920'>+</span> <span class='IntImm Imm' id='2907-2922'>-1</span><span class='matched' id='2907-2924'>)</span></span><span class='matched' id='2825-2926'>)</span></span></span><span class='matched' id='2799-2928'>)</span></span><span class='matched' id='2732-2930'>)</span></div><div class='AssertStmt WrapLine' id='2644-2932'><div class='node-cost' id='2933-2935'><div id='cc-2934' class='cost-btn CostColor8'   aria-describedby='tooltip-cc-2934'   line-cost='8' block-cost='8'   line-cost-color='8' block-cost-color='8'></div><span id='tooltip-cc-2934' class='tooltip cond-tooltop' 
role='tooltip-cc-2934'>Op Count: 8</span><div id='dc-2934' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-2934'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-dc-2934' class='tooltip cond-tooltop' role='tooltip-dc-2934'>Bits Moved: 0</span></div><span class='matched' id='2933-2937'><span class='Symbol matched' id='2938-2939'>assert</span>(</span><span class='BinaryOp' id='2933-2941'><span class='matched' id='2942-2943'>(</span><span class='BinaryOp' id='2942-2945'><span class='matched' id='2946-2947'>(</span><span class='IntImm Imm' id='2946-2949'>0</span> <span class='matched Operator' id='2946-2951'>&lt=</span> <b class='variable matched' id='2222-2953'>conv_y.min.1.required.s</b><span class='matched' id='2946-2954'>)</span></span> <span class='matched Operator' id='2942-2956'>&amp;&amp;</span> <span class='BinaryOp' id='2942-2958'><span class='matched' id='2959-2960'>(</span><span class='BinaryOp' id='2959-2962'><span class='matched' id='2963-2964'>(</span><span class='BinaryOp' id='2963-2966'><span class='matched' id='2967-2968'>(</span><span class='BinaryOp' id='2967-2970'><span class='matched' id='2971-2972'>(</span><b class='variable matched' id='1782-2974'>conv_y.min.1</b> <span class='matched Operator' id='2971-2975'>+</span> <b class='variable matched' id='2222-2977'>conv_y.min.1.required.s</b><span class='matched' id='2971-2978'>)</span></span> <span class='matched Operator' id='2967-2980'>+</span> <b class='variable matched' id='1976-2982'>conv_y.extent.1.required.s</b><span class='matched' id='2967-2983'>)</span></span> <span class='matched Operator' id='2963-2985'>+</span> <span class='IntImm Imm' id='2963-2987'>64</span><span class='matched' id='2963-2989'>)</span></span> <span class='matched Operator' id='2959-2991'>&lt=</span> <span class='BinaryOp' id='2959-2993'><span class='matched' id='2994-2995'>(</span><b class='variable matched' id='1813-2997'>conv_y.extent.1</b> <span 
class='matched Operator' id='2994-2998'>+</span> <b class='variable matched' id='1782-3000'>conv_y.min.1</b><span class='matched' id='2994-3001'>)</span></span><span class='matched' id='2959-3003'>)</span></span><span class='matched' id='2942-3005'>)</span></span><span class='matched' id='2933-3007'>, </span><span class='Let' id='2933-3010'><span class='matched' id='3011-3012'>(<span class='keyword' id='3013-3014'>let </span><b class='variable matched' id='3009-3016'>t153</b><span class='Operator Assign' id='3013-3017'> = </span></span><span class='BinaryOp' id='3011-3019'><span class='matched' id='3020-3021'>(</span><b class='variable matched' id='1782-3023'>conv_y.min.1</b> <span class='matched Operator' id='3020-3024'>+</span> <b class='variable matched' id='2222-3026'>conv_y.min.1.required.s</b><span class='matched' id='3020-3027'>)</span></span><span class='matched keyword' id='3011-3029'> in </span><span class='Call' id='3011-3032'><span class='nav-anchor' id='fn-call-3031'><span class='matched' id='3034-3035'><span class='Symbol matched' id='3036-3037'>halide_error_access_out_of_bounds</span>(</span><span class='StringImm Imm' id='3034-3039'>"Output buffer conv_y"</span><span class='matched' id='3034-3041'>, </span><span class='IntImm Imm' id='3034-3043'>1</span><span class='matched' id='3034-3045'>, </span><b class='variable matched' id='3009-3047'>t153</b><span class='matched' id='3034-3048'>, </span><span class='BinaryOp' id='3034-3050'><span class='matched' id='3051-3052'>(</span><span class='BinaryOp' id='3051-3054'><span class='matched' id='3055-3056'>(</span><b class='variable matched' id='3009-3058'>t153</b> <span class='matched Operator' id='3055-3059'>+</span> <b class='variable matched' id='1976-3061'>conv_y.extent.1.required.s</b><span class='matched' id='3055-3062'>)</span></span> <span class='matched Operator' id='3051-3064'>+</span> <span class='IntImm Imm' id='3051-3066'>63</span><span class='matched' id='3051-3068'>)</span></span><span 
class='matched' id='3034-3070'>, </span><b class='variable matched' id='1782-3072'>conv_y.min.1</b><span class='matched' id='3034-3073'>, </span><span class='BinaryOp' id='3034-3075'><span class='matched' id='3076-3077'>(</span><span class='BinaryOp' id='3076-3079'><span class='matched' id='3080-3081'>(</span><b class='variable matched' id='1813-3083'>conv_y.extent.1</b> <span class='matched Operator' id='3080-3084'>+</span> <b class='variable matched' id='1782-3086'>conv_y.min.1</b><span class='matched' id='3080-3087'>)</span></span> <span class='matched Operator' id='3076-3089'>+</span> <span class='IntImm Imm' id='3076-3091'>-1</span><span class='matched' id='3076-3093'>)</span></span><span class='matched' id='3034-3095'>)</span></span></span><span class='matched' id='3011-3097'>)</span></span><span class='matched' id='2933-3099'>)</span></div><div class='AssertStmt WrapLine' id='2644-3101'><div class='node-cost' id='3102-3104'><div id='cc-3103' class='cost-btn CostColor2'   aria-describedby='tooltip-cc-3103'   line-cost='2' block-cost='2'   line-cost-color='2' block-cost-color='2'></div><span id='tooltip-cc-3103' class='tooltip cond-tooltop' role='tooltip-cc-3103'>Op Count: 2</span><div id='dc-3103' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-3103'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-dc-3103' class='tooltip cond-tooltop' role='tooltip-dc-3103'>Bits Moved: 0</span></div><span class='matched' id='3102-3106'><span class='Symbol matched' id='3107-3108'>assert</span>(</span><span class='BinaryOp' id='3102-3110'><span class='matched' id='3111-3112'>(</span><span class='IntImm Imm' id='3111-3114'>0</span> <span class='matched Operator' id='3111-3116'>&lt=</span> <b class='variable matched' id='1813-3118'>conv_y.extent.1</b><span class='matched' id='3111-3119'>)</span></span><span class='matched' id='3102-3121'>, </span><span class='Call' id='3102-3124'><span class='nav-anchor' 
id='fn-call-3123'><span class='matched' id='3126-3127'><span class='Symbol matched' id='3128-3129'>halide_error_buffer_extents_negative</span>(</span><span class='StringImm Imm' id='3126-3131'>"Output buffer conv_y"</span><span class='matched' id='3126-3133'>, </span><span class='IntImm Imm' id='3126-3135'>1</span><span class='matched' id='3126-3137'>, </span><b class='variable matched' id='1813-3139'>conv_y.extent.1</b><span class='matched' id='3126-3140'>)</span></span></span><span class='matched' id='3102-3142'>)</span></div><div class='AssertStmt WrapLine' id='2644-3144'><div class='node-cost' id='3145-3147'><div id='cc-3146' class='cost-btn CostColor2'   aria-describedby='tooltip-cc-3146'   line-cost='2' block-cost='2'   line-cost-color='2' block-cost-color='2'></div><span id='tooltip-cc-3146' class='tooltip cond-tooltop' role='tooltip-cc-3146'>Op Count: 2</span><div id='dc-3146' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-3146'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-dc-3146' class='tooltip cond-tooltop' role='tooltip-dc-3146'>Bits Moved: 0</span></div><span class='matched' id='3145-3149'><span class='Symbol matched' id='3150-3151'>assert</span>(</span><span class='BinaryOp' id='3145-3153'><span class='matched' id='3154-3155'>(</span><b class='variable matched' id='1751-3157'>conv_y.stride.0</b> <span class='matched Operator' id='3154-3158'>==</span> <span class='IntImm Imm' id='3154-3160'>1</span><span class='matched' id='3154-3162'>)</span></span><span class='matched' id='3145-3164'>, </span><span class='Call' id='3145-3167'><span class='nav-anchor' id='fn-call-3166'><span class='matched' id='3169-3170'><span class='Symbol matched' id='3171-3172'>halide_error_constraint_violated</span>(</span><span class='StringImm Imm' id='3169-3174'>"conv_y.stride.0"</span><span class='matched' id='3169-3176'>, </span><b class='variable matched' id='1751-3178'>conv_y.stride.0</b><span class='matched' 
id='3169-3179'>, </span><span class='StringImm Imm' id='3169-3181'>"1"</span><span class='matched' id='3169-3183'>, </span><span class='IntImm Imm' id='3169-3185'>1</span><span class='matched' id='3169-3187'>)</span></span></span><span class='matched' id='3145-3189'>)</span></div><div class='LetStmt' id='2644-3193'><div class='node-cost' id='3194-3195'><div id='cc-3191' class='cost-btn CostColor3'   aria-describedby='tooltip-cc-3191'   line-cost='3' block-cost='36'   line-cost-color='3' block-cost-color='19'></div><span id='tooltip-cc-3191' class='tooltip cond-tooltop' role='tooltip-cc-3191'>Op Count: 3</span><div id='dc-3191' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-3191'   line-cost='0' block-cost='32'   line-cost-color='0' block-cost-color='1'></div><span id='tooltip-dc-3191' class='tooltip cond-tooltop' role='tooltip-dc-3191'>Bits Moved: 0</span></div><p class='WrapLine' id='3194-3197'><span class='cost-highlight' id='cost-bg-3191'><span class='matched' id='3199-3200'><span class='keyword' id='3201-3202'>let </span><b class='variable matched' id='3192-3204'>conv_y.total_extent.1</b><span class='Operator Assign' id='3201-3205'> = </span></span><span class='BinaryOp' id='3199-3207'><span class='matched' id='3208-3209'>(</span><span class='Cast' id='3208-3211'><span class='matched' id='3212-3213'><span class='Type' id='3214-3215'>int64</span>(</span><b class='variable matched' id='1813-3217'>conv_y.extent.1</b><span class='matched' id='3212-3218'>)</span></span> <span class='matched Operator' id='3208-3220'>*</span> <span class='Cast' id='3208-3222'><span class='matched' id='3223-3224'><span class='Type' id='3225-3226'>int64</span>(</span><b class='variable matched' id='1720-3228'>conv_y.extent.0</b><span class='matched' id='3223-3229'>)</span></span><span class='matched' id='3208-3231'>)</span></span></span></p><div class='Block' id='3194-3233'><div class='AssertStmt WrapLine' id='3234-3235'><div class='node-cost' id='3236-3238'><div 
id='cc-3237' class='cost-btn CostColor3'   aria-describedby='tooltip-cc-3237'   line-cost='3' block-cost='3'   line-cost-color='3' block-cost-color='3'></div><span id='tooltip-cc-3237' class='tooltip cond-tooltop' role='tooltip-cc-3237'>Op Count: 3</span><div id='dc-3237' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-3237'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-dc-3237' class='tooltip cond-tooltop' role='tooltip-dc-3237'>Bits Moved: 0</span></div><span class='matched' id='3236-3240'><span class='Symbol matched' id='3241-3242'>assert</span>(</span><span class='BinaryOp' id='3236-3244'><span class='matched' id='3245-3246'>(</span><span class='Cast' id='3245-3248'><span class='matched' id='3249-3250'><span class='Type' id='3251-3252'>uint64</span>(</span><b class='variable matched' id='1720-3254'>conv_y.extent.0</b><span class='matched' id='3249-3255'>)</span></span> <span class='matched Operator' id='3245-3257'>&lt=</span> <span class='UIntImm Imm' id='3245-3259'>(uint64)2147483647</span><span class='matched' id='3245-3261'>)</span></span><span class='matched' id='3236-3263'>, </span><span class='Call' id='3236-3266'><span class='nav-anchor' id='fn-call-3265'><span class='matched' id='3268-3269'><span class='Symbol matched' id='3270-3271'>halide_error_buffer_allocation_too_large</span>(</span><span class='StringImm Imm' id='3268-3273'>"conv_y"</span><span class='matched' id='3268-3275'>, </span><span class='Cast' id='3268-3277'><span class='matched' id='3278-3279'><span class='Type' id='3280-3281'>uint64</span>(</span><b class='variable matched' id='1720-3283'>conv_y.extent.0</b><span class='matched' id='3278-3284'>)</span></span><span class='matched' id='3268-3286'>, </span><span class='UIntImm Imm' id='3268-3288'>(uint64)2147483647</span><span class='matched' id='3268-3290'>)</span></span></span><span class='matched' id='3236-3292'>)</span></div><div class='AssertStmt WrapLine' 
id='3234-3294'><div class='node-cost' id='3295-3297'><div id='cc-3296' class='cost-btn CostColor6'   aria-describedby='tooltip-cc-3296'   line-cost='6' block-cost='6'   line-cost-color='6' block-cost-color='6'></div><span id='tooltip-cc-3296' class='tooltip cond-tooltop' role='tooltip-cc-3296'>Op Count: 6</span><div id='dc-3296' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-3296'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-dc-3296' class='tooltip cond-tooltop' role='tooltip-dc-3296'>Bits Moved: 0</span></div><span class='matched' id='3295-3299'><span class='Symbol matched' id='3300-3301'>assert</span>(</span><span class='BinaryOp' id='3295-3303'><span class='matched' id='3304-3305'>(</span><span class='Call' id='3304-3308'><span class='nav-anchor' id='fn-call-3307'><span class='matched' id='3310-3311'><span class='Symbol matched' id='3312-3313'>abs</span>(</span><span class='BinaryOp' id='3310-3315'><span class='matched' id='3316-3317'>(</span><span class='Cast' id='3316-3319'><span class='matched' id='3320-3321'><span class='Type' id='3322-3323'>int64</span>(</span><b class='variable matched' id='1813-3325'>conv_y.extent.1</b><span class='matched' id='3320-3326'>)</span></span> <span class='matched Operator' id='3316-3328'>*</span> <span class='Cast' id='3316-3330'><span class='matched' id='3331-3332'><span class='Type' id='3333-3334'>int64</span>(</span><b class='variable matched' id='1844-3336'>conv_y.stride.1</b><span class='matched' id='3331-3337'>)</span></span><span class='matched' id='3316-3339'>)</span></span><span class='matched' id='3310-3341'>)</span></span></span> <span class='matched Operator' id='3304-3343'>&lt=</span> <span class='UIntImm Imm' id='3304-3345'>(uint64)2147483647</span><span class='matched' id='3304-3347'>)</span></span><span class='matched' id='3295-3349'>, </span><span class='Call' id='3295-3352'><span class='nav-anchor' id='fn-call-3351'><span class='matched' 
id='3354-3355'><span class='Symbol matched' id='3356-3357'>halide_error_buffer_allocation_too_large</span>(</span><span class='StringImm Imm' id='3354-3359'>"conv_y"</span><span class='matched' id='3354-3361'>, </span><span class='Call' id='3354-3363'><span class='nav-anchor' id='fn-call-3307'><span class='matched' id='3365-3366'><span class='Symbol matched' id='3367-3368'>abs</span>(</span><span class='BinaryOp' id='3365-3370'><span class='matched' id='3371-3372'>(</span><span class='Cast' id='3371-3374'><span class='matched' id='3375-3376'><span class='Type' id='3377-3378'>int64</span>(</span><b class='variable matched' id='1813-3380'>conv_y.extent.1</b><span class='matched' id='3375-3381'>)</span></span> <span class='matched Operator' id='3371-3383'>*</span> <span class='Cast' id='3371-3385'><span class='matched' id='3386-3387'><span class='Type' id='3388-3389'>int64</span>(</span><b class='variable matched' id='1844-3391'>conv_y.stride.1</b><span class='matched' id='3386-3392'>)</span></span><span class='matched' id='3371-3394'>)</span></span><span class='matched' id='3365-3396'>)</span></span></span><span class='matched' id='3354-3398'>, </span><span class='UIntImm Imm' id='3354-3400'>(uint64)2147483647</span><span class='matched' id='3354-3402'>)</span></span></span><span class='matched' id='3295-3404'>)</span></div><div class='AssertStmt WrapLine' id='3234-3406'><div class='node-cost' id='3407-3409'><div id='cc-3408' class='cost-btn CostColor2'   aria-describedby='tooltip-cc-3408'   line-cost='2' block-cost='2'   line-cost-color='2' block-cost-color='2'></div><span id='tooltip-cc-3408' class='tooltip cond-tooltop' role='tooltip-cc-3408'>Op Count: 2</span><div id='dc-3408' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-3408'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-dc-3408' class='tooltip cond-tooltop' role='tooltip-dc-3408'>Bits Moved: 0</span></div><span class='matched' 
id='3407-3411'><span class='Symbol matched' id='3412-3413'>assert</span>(</span><span class='BinaryOp' id='3407-3415'><span class='matched' id='3416-3417'>(</span><b class='variable matched' id='3192-3419'>conv_y.total_extent.1</b> <span class='matched Operator' id='3416-3420'>&lt=</span> <span class='IntImm Imm' id='3416-3422'>(int64)2147483647</span><span class='matched' id='3416-3424'>)</span></span><span class='matched' id='3407-3426'>, </span><span class='Call' id='3407-3429'><span class='nav-anchor' id='fn-call-3428'><span class='matched' id='3431-3432'><span class='Symbol matched' id='3433-3434'>halide_error_buffer_extents_too_large</span>(</span><span class='StringImm Imm' id='3431-3436'>"conv_y"</span><span class='matched' id='3431-3438'>, </span><b class='variable matched' id='3192-3440'>conv_y.total_extent.1</b><span class='matched' id='3431-3441'>, </span><span class='IntImm Imm' id='3431-3443'>(int64)2147483647</span><span class='matched' id='3431-3445'>)</span></span></span><span class='matched' id='3407-3447'>)</span></div><div class='AssertStmt WrapLine' id='3234-3449'><div class='node-cost' id='3450-3452'><div id='cc-3451' class='cost-btn CostColor2'   aria-describedby='tooltip-cc-3451'   line-cost='2' block-cost='2'   line-cost-color='2' block-cost-color='2'></div><span id='tooltip-cc-3451' class='tooltip cond-tooltop' role='tooltip-cc-3451'>Op Count: 2</span><div id='dc-3451' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-3451'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-dc-3451' class='tooltip cond-tooltop' role='tooltip-dc-3451'>Bits Moved: 0</span></div><span class='matched' id='3450-3454'><span class='Symbol matched' id='3455-3456'>assert</span>(</span><span class='Not' id='3450-3458'>!<b class='variable matched' id='1635-3460'>conv_y.device_dirty</b></span><span class='matched' id='3450-3461'>, </span><span class='Call' id='3450-3464'><span class='nav-anchor' 
id='fn-call-3463'><span class='matched' id='3466-3467'><span class='Symbol matched' id='3468-3469'>halide_error_device_dirty_with_no_device_support</span>(</span><span class='StringImm Imm' id='3466-3471'>"Output buffer conv_y"</span><span class='matched' id='3466-3473'>)</span></span></span><span class='matched' id='3450-3475'>)</span></div><div class='AssertStmt WrapLine' id='3234-3477'><div class='node-cost' id='3478-3480'><div id='cc-3479' class='cost-btn CostColor2'   aria-describedby='tooltip-cc-3479'   line-cost='2' block-cost='2'   line-cost-color='2' block-cost-color='2'></div><span id='tooltip-cc-3479' class='tooltip cond-tooltop' role='tooltip-cc-3479'>Op Count: 2</span><div id='dc-3479' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-3479'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-dc-3479' class='tooltip cond-tooltop' role='tooltip-dc-3479'>Bits Moved: 0</span></div><span class='matched' id='3478-3482'><span class='Symbol matched' id='3483-3484'>assert</span>(</span><span class='BinaryOp' id='3478-3486'><span class='matched' id='3487-3488'>(</span><b class='variable matched' id='1581-3490'>conv_y</b> <span class='matched Operator' id='3487-3491'>!=</span> <span class='Reinterpret' id='3487-3493'><span class='matched' id='3494-3495'><span class='Type' id='3496-3497'>(void *)</span>(</span><span class='UIntImm Imm' id='3494-3499'>(uint64)0</span><span class='matched' id='3494-3501'>)</span></span><span class='matched' id='3487-3503'>)</span></span><span class='matched' id='3478-3505'>, </span><span class='Call' id='3478-3508'><span class='nav-anchor' id='fn-call-3507'><span class='matched' id='3510-3511'><span class='Symbol matched' id='3512-3513'>halide_error_host_is_null</span>(</span><span class='StringImm Imm' id='3510-3515'>"Output buffer conv_y"</span><span class='matched' id='3510-3517'>)</span></span></span><span class='matched' id='3478-3519'>)</span></div><div class='Allocate' 
id='3234-3523'><div class='node-cost' id='3524-3525'><div id='cc-3521' class='cost-btn CostColor0'   aria-describedby='tooltip-cc-3521'   line-cost='0' block-cost='18'   line-cost-color='0' block-cost-color='18'></div><span id='tooltip-cc-3521' class='tooltip cond-tooltop' role='tooltip-cc-3521'>Op Count: 0</span><div id='dc-3521' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-3521'   line-cost='0' block-cost='32'   line-cost-color='0' block-cost-color='1'></div><span id='tooltip-dc-3521' class='tooltip cond-tooltop' role='tooltip-dc-3521'>Bits Moved: 0</span></div><span class='matched' id='3524-3527'><span class='keyword nav-anchor' id='allocate-3521'>allocate </span><b class='variable matched' id='3522-3530'>kernel</b>[</span><span class='Type' id='3524-3531'>float32</span> * <span class='IntImm Imm' id='3524-3533'>20</span><span class='matched' id='3524-3535'>]</span><button class='icon-btn sync-btn' onclick='scrollToViz("allocate-viz-3521")'>  <i class='bi bi-arrow-right-square' title='Jump to visualization'></i></button><div class='AllocateBody' id='3524-3537'><div class='Block' id='3538-3539'><div class='Produce' id='3540-3542'><div class='node-cost' id='3543-3544'><div id='cc-3541' class='cost-btn CostColor0'   aria-describedby='tooltip-cc-3541'   line-cost='0' block-cost='3'   line-cost-color='0' block-cost-color='3'></div><span id='tooltip-cc-3541' class='tooltip cond-tooltop' role='tooltip-cc-3541'>Op Count: 0</span><div id='dc-3541' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-3541'   line-cost='0' block-cost='32'   line-cost-color='0' block-cost-color='1'></div><span id='tooltip-dc-3541' class='tooltip cond-tooltop' role='tooltip-dc-3541'>Bits Moved: 0</span></div><a onclick='return toggle(3541);' href=_blank><div class='show-hide-btn-wrapper'>  <div class='show-hide-btn' style='display:none;' id=3541-show>    <i class='bi bi-plus-square' title='Expand code block'></i>  </div>  <div class='show-hide-btn' id=3541-hide>    <i 
class='bi bi-dash-square' title='Collapse code block'></i>  </div></div><span class='matched' id='3543-3546'><span class='keyword nav-anchor' id='prodcons-3541'>produce </span><b class='variable matched' id='3541-3549'>kernel</b></span></a><button class='icon-btn sync-btn' onclick='scrollToViz("prodcons-viz-3541")'>  <i class='bi bi-arrow-right-square' title='Jump to visualization'></i></button><span class='matched' id='3543-3550'>{</span><div class='indent ProducerConsumerBody' id='3541'><div class='For' id='3552-3554'><div class='node-cost' id='3555-3556'><div id='cc-3553' class='cost-btn CostColor0'   aria-describedby='tooltip-cc-3553'   line-cost='0' block-cost='3'   line-cost-color='0' block-cost-color='3'></div><span id='tooltip-cc-3553' class='tooltip cond-tooltop' role='tooltip-cc-3553'>Op Count: 0</span><div id='dc-3553' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-3553'   line-cost='0' block-cost='32'   line-cost-color='0' block-cost-color='1'></div><span id='tooltip-dc-3553' class='tooltip cond-tooltop' role='tooltip-dc-3553'>Bits Moved: 0</span></div><a onclick='return toggle(3553);' href=_blank><div class='show-hide-btn-wrapper'>  <div class='show-hide-btn' style='display:none;' id=3553-show>    <i class='bi bi-plus-square' title='Expand code block'></i>  </div>  <div class='show-hide-btn' id=3553-hide>    <i class='bi bi-dash-square' title='Collapse code block'></i>  </div></div><span class='matched' id='3555-3558'><span class='keyword nav-anchor' id='loop-3553'>for</span> (</span><b class='variable matched' id='3553-3561'>kernel.s0.x</b><span class='matched' id='3555-3562'>, </span><span class='IntImm Imm' id='3555-3564'>0</span><span class='matched' id='3555-3566'>, </span><span class='IntImm Imm' id='3555-3568'>20</span><span class='matched' id='3555-3570'>)</span></a><button class='icon-btn sync-btn' onclick='scrollToViz("loop-viz-3553")'>  <i class='bi bi-arrow-right-square' title='Jump to visualization'></i></button><span 
class='matched' id='3555-3572'>{</span><div class='indent ForBody' id='3553'><div class='Store WrapLine' id='3574-3576'><div class='node-cost' id='3577-3578'><div id='cc-3575' class='cost-btn CostColor3'   aria-describedby='tooltip-cc-3575'   line-cost='3' block-cost='3'   line-cost-color='3' block-cost-color='3'></div><span id='tooltip-cc-3575' class='tooltip cond-tooltop' role='tooltip-cc-3575'>Op Count: 3</span><div id='dc-3575' class='cost-btn CostColor1'   aria-describedby='tooltip-dc-3575'   line-cost='32' block-cost='32'   line-cost-color='1' block-cost-color='1'></div><span id='tooltip-dc-3575' class='tooltip cond-tooltop' role='tooltip-dc-3575'>Bits Moved: 32</span></div><span class='matched' id='3577-3580'><span class='nav-anchor' id='store-3575'><b class='variable matched' id='3541-3583'>kernel</b>[</span></span><b class='variable matched' id='3553-3584'>kernel.s0.x</b><span class='matched' id='3577-3585'>]</span><span class='Operator Assign Matched' id='3577-3587'> = </span><span class='StoreValue' id='3577-3589'><span class='Call' id='3590-3592'><span class='nav-anchor' id='fn-call-3591'><span class='matched' id='3594-3595'><span class='Symbol matched' id='3596-3597'>exp_f32</span>(</span><span class='Cast' id='3594-3599'><span class='matched' id='3600-3601'><span class='Type' id='3602-3603'>float32</span>(</span><span class='BinaryOp' id='3600-3605'><span class='matched' id='3606-3607'>(</span><span class='IntImm Imm' id='3606-3609'>0</span> <span class='matched Operator' id='3606-3611'>-</span> <b class='variable matched' id='3553-3613'>kernel.s0.x</b><span class='matched' id='3606-3614'>)</span></span><span class='matched' id='3600-3616'>)</span></span><span class='matched' id='3594-3618'>)</span></span></span></span></div></div><span class='matched ClosingBrace cb-3553' id='3555-3620'>}</span></div></div><span class='matched ClosingBrace cb-3541' id='3543-3622'>}</span></div><div class='LetStmt' id='3540-3626'><div class='node-cost' 
id='3627-3628'><div id='cc-3624' class='cost-btn CostColor5'   aria-describedby='tooltip-cc-3624'   line-cost='5' block-cost='15'   line-cost-color='5' block-cost-color='15'></div><span id='tooltip-cc-3624' class='tooltip cond-tooltop' role='tooltip-cc-3624'>Op Count: 5</span><div id='dc-3624' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-3624'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-dc-3624' class='tooltip cond-tooltop' role='tooltip-dc-3624'>Bits Moved: 0</span></div><p class='WrapLine' id='3627-3630'><span class='cost-highlight' id='cost-bg-3624'><span class='matched' id='3632-3633'><span class='keyword' id='3634-3635'>let </span><b class='variable matched' id='3625-3637'>conv_y.s0.x.xo.tile.loop_extent</b><span class='Operator Assign' id='3634-3638'> = </span></span><span class='BinaryOp' id='3632-3640'><span class='matched' id='3641-3642'>(</span><span class='BinaryOp' id='3641-3644'><span class='matched' id='3645-3646'>(</span><span class='BinaryOp' id='3645-3648'><span class='matched' id='3649-3650'>(</span><b class='variable matched' id='1720-3652'>conv_y.extent.0</b> <span class='matched Operator' id='3649-3653'>+</span> <span class='IntImm Imm' id='3649-3655'>63</span><span class='matched' id='3649-3657'>)</span></span> <span class='matched Operator' id='3645-3659'>/</span> <span class='IntImm Imm' id='3645-3661'>64</span><span class='matched' id='3645-3663'>)</span></span> <span class='matched Operator' id='3641-3665'>*</span> <span class='BinaryOp' id='3641-3667'><span class='matched' id='3668-3669'>(</span><span class='BinaryOp' id='3668-3671'><span class='matched' id='3672-3673'>(</span><b class='variable matched' id='1813-3675'>conv_y.extent.1</b> <span class='matched Operator' id='3672-3676'>+</span> <span class='IntImm Imm' id='3672-3678'>63</span><span class='matched' id='3672-3680'>)</span></span> <span class='matched Operator' id='3668-3682'>/</span> <span class='IntImm Imm' 
id='3668-3684'>64</span><span class='matched' id='3668-3686'>)</span></span><span class='matched' id='3641-3688'>)</span></span></span></p><div class='Produce' id='3627-3691'><div class='node-cost' id='3692-3693'><div id='cc-3690' class='cost-btn CostColor0'   aria-describedby='tooltip-cc-3690'   line-cost='0' block-cost='10'   line-cost-color='0' block-cost-color='10'></div><span id='tooltip-cc-3690' class='tooltip cond-tooltop' role='tooltip-cc-3690'>Op Count: 0</span><div id='dc-3690' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-3690'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-dc-3690' class='tooltip cond-tooltop' role='tooltip-dc-3690'>Bits Moved: 0</span></div><a onclick='return toggle(3690);' href=_blank><div class='show-hide-btn-wrapper'>  <div class='show-hide-btn' style='display:none;' id=3690-show>    <i class='bi bi-plus-square' title='Expand code block'></i>  </div>  <div class='show-hide-btn' id=3690-hide>    <i class='bi bi-dash-square' title='Collapse code block'></i>  </div></div><span class='matched' id='3692-3695'><span class='keyword nav-anchor' id='prodcons-3690'>produce </span><b class='variable matched' id='3690-3698'>conv_y</b></span></a><button class='icon-btn sync-btn' onclick='scrollToViz("prodcons-viz-3690")'>  <i class='bi bi-arrow-right-square' title='Jump to visualization'></i></button><span class='matched' id='3692-3699'>{</span><div class='indent ProducerConsumerBody' id='3690'><div class='Consumer' id='3701-3703'><div class='node-cost' id='3704-3705'><div id='cc-3702' class='cost-btn CostColor0'   aria-describedby='tooltip-cc-3702'   line-cost='0' block-cost='10'   line-cost-color='0' block-cost-color='10'></div><span id='tooltip-cc-3702' class='tooltip cond-tooltop' role='tooltip-cc-3702'>Op Count: 0</span><div id='dc-3702' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-3702'   line-cost='0' block-cost='0'   line-cost-color='0' 
block-cost-color='0'></div><span id='tooltip-dc-3702' class='tooltip cond-tooltop' role='tooltip-dc-3702'>Bits Moved: 0</span></div><a onclick='return toggle(3702);' href=_blank><div class='show-hide-btn-wrapper'>  <div class='show-hide-btn' style='display:none;' id=3702-show>    <i class='bi bi-plus-square' title='Expand code block'></i>  </div>  <div class='show-hide-btn' id=3702-hide>    <i class='bi bi-dash-square' title='Collapse code block'></i>  </div></div><span class='matched' id='3704-3707'><span class='keyword nav-anchor' id='prodcons-3702'>consume </span><b class='variable matched' id='3702-3710'>kernel</b></span></a><button class='icon-btn sync-btn' onclick='scrollToViz("prodcons-viz-3702")'>  <i class='bi bi-arrow-right-square' title='Jump to visualization'></i></button><span class='matched' id='3704-3711'>{</span><div class='indent ProducerConsumerBody' id='3702'><div class='LetStmt' id='3713-3716'><div class='node-cost' id='3717-3718'><div id='cc-3714' class='cost-btn CostColor2'   aria-describedby='tooltip-cc-3714'   line-cost='2' block-cost='10'   line-cost-color='2' block-cost-color='10'></div><span id='tooltip-cc-3714' class='tooltip cond-tooltop' role='tooltip-cc-3714'>Op Count: 2</span><div id='dc-3714' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-3714'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-dc-3714' class='tooltip cond-tooltop' role='tooltip-dc-3714'>Bits Moved: 0</span></div><p class='WrapLine' id='3717-3720'><span class='cost-highlight' id='cost-bg-3714'><span class='matched' id='3722-3723'><span class='keyword' id='3724-3725'>let </span><b class='variable matched' id='3715-3727'>t139</b><span class='Operator Assign' id='3724-3728'> = </span></span><span class='BinaryOp' id='3722-3730'><span class='matched' id='3731-3732'>(</span><span class='BinaryOp' id='3731-3734'><span class='matched' id='3735-3736'>(</span><b class='variable matched' 
id='1720-3738'>conv_y.extent.0</b> <span class='matched Operator' id='3735-3739'>+</span> <span class='IntImm Imm' id='3735-3741'>63</span><span class='matched' id='3735-3743'>)</span></span> <span class='matched Operator' id='3731-3745'>/</span> <span class='IntImm Imm' id='3731-3747'>64</span><span class='matched' id='3731-3749'>)</span></span></span></p><div class='LetStmt' id='3717-3753'><div class='node-cost' id='3754-3755'><div id='cc-3751' class='cost-btn CostColor2'   aria-describedby='tooltip-cc-3751'   line-cost='2' block-cost='8'   line-cost-color='2' block-cost-color='8'></div><span id='tooltip-cc-3751' class='tooltip cond-tooltop' role='tooltip-cc-3751'>Op Count: 2</span><div id='dc-3751' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-3751'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-dc-3751' class='tooltip cond-tooltop' role='tooltip-dc-3751'>Bits Moved: 0</span></div><p class='WrapLine' id='3754-3757'><span class='cost-highlight' id='cost-bg-3751'><span class='matched' id='3759-3760'><span class='keyword' id='3761-3762'>let </span><b class='variable matched' id='3752-3764'>t141</b><span class='Operator Assign' id='3761-3765'> = </span></span><span class='BinaryOp' id='3759-3767'><span class='matched' id='3768-3769'>(</span><span class='IntImm Imm' id='3768-3771'>0</span> <span class='matched Operator' id='3768-3773'>-</span> <span class='BinaryOp' id='3768-3775'><span class='matched' id='3776-3777'>(</span><b class='variable matched' id='1782-3779'>conv_y.min.1</b> <span class='matched Operator' id='3776-3780'>*</span> <b class='variable matched' id='1844-3782'>conv_y.stride.1</b><span class='matched' id='3776-3783'>)</span></span><span class='matched' id='3768-3785'>)</span></span></span></p><div class='LetStmt' id='3754-3789'><div class='node-cost' id='3790-3791'><div id='cc-3787' class='cost-btn CostColor1'   aria-describedby='tooltip-cc-3787'   line-cost='1' block-cost='6'   
line-cost-color='1' block-cost-color='6'></div><span id='tooltip-cc-3787' class='tooltip cond-tooltop' role='tooltip-cc-3787'>Op Count: 1</span><div id='dc-3787' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-3787'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-dc-3787' class='tooltip cond-tooltop' role='tooltip-dc-3787'>Bits Moved: 0</span></div><p class='WrapLine' id='3790-3793'><span class='cost-highlight' id='cost-bg-3787'><span class='matched' id='3795-3796'><span class='keyword' id='3797-3798'>let </span><b class='variable matched' id='3788-3800'>t140</b><span class='Operator Assign' id='3797-3801'> = </span></span><span class='BinaryOp' id='3795-3803'><span class='matched' id='3804-3805'>(</span><b class='variable matched' id='1689-3807'>conv_y.min.0</b> <span class='matched Operator' id='3804-3808'>+</span> <b class='variable matched' id='1782-3810'>conv_y.min.1</b><span class='matched' id='3804-3811'>)</span></span></span></p><div class='LetStmt' id='3790-3815'><div class='node-cost' id='3816-3817'><div id='cc-3813' class='cost-btn CostColor1'   aria-describedby='tooltip-cc-3813'   line-cost='1' block-cost='5'   line-cost-color='1' block-cost-color='5'></div><span id='tooltip-cc-3813' class='tooltip cond-tooltop' role='tooltip-cc-3813'>Op Count: 1</span><div id='dc-3813' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-3813'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-dc-3813' class='tooltip cond-tooltop' role='tooltip-dc-3813'>Bits Moved: 0</span></div><p class='WrapLine' id='3816-3819'><span class='cost-highlight' id='cost-bg-3813'><span class='matched' id='3821-3822'><span class='keyword' id='3823-3824'>let </span><b class='variable matched' id='3814-3826'>parallel_closure</b><span class='Operator Assign' id='3823-3827'> = </span></span><span class='Call' id='3821-3830'><span class='nav-anchor' id='fn-call-3829'><span 
class='matched' id='3832-3833'><span class='Symbol matched' id='3834-3835'>make_struct</span>(</span><b class='variable matched' id='3690-3837'>conv_y</b><span class='matched' id='3832-3838'>, </span><b class='variable matched' id='3702-3840'>kernel</b><span class='matched' id='3832-3841'>, </span><b class='variable matched' id='1720-3843'>conv_y.extent.0</b><span class='matched' id='3832-3844'>, </span><b class='variable matched' id='1813-3846'>conv_y.extent.1</b><span class='matched' id='3832-3847'>, </span><b class='variable matched' id='1782-3849'>conv_y.min.1</b><span class='matched' id='3832-3850'>, </span><b class='variable matched' id='1844-3852'>conv_y.stride.1</b><span class='matched' id='3832-3853'>, </span><b class='variable matched' id='3715-3855'>t139</b><span class='matched' id='3832-3856'>, </span><b class='variable matched' id='3788-3858'>t140</b><span class='matched' id='3832-3859'>, </span><b class='variable matched' id='3752-3861'>t141</b><span class='matched' id='3832-3862'>)</span></span></span></span></p><div class='LetStmt' id='3816-3866'><div class='node-cost' id='3867-3868'><div id='cc-3864' class='cost-btn CostColor2'   aria-describedby='tooltip-cc-3864'   line-cost='2' block-cost='4'   line-cost-color='2' block-cost-color='4'></div><span id='tooltip-cc-3864' class='tooltip cond-tooltop' role='tooltip-cc-3864'>Op Count: 2</span><div id='dc-3864' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-3864'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-dc-3864' class='tooltip cond-tooltop' role='tooltip-dc-3864'>Bits Moved: 0</span></div><p class='WrapLine' id='3867-3870'><span class='cost-highlight' id='cost-bg-3864'><span class='matched' id='3872-3873'><span class='keyword' id='3874-3875'>let </span><b class='variable matched' id='3865-3877'>closure_result</b><span class='Operator Assign' id='3874-3878'> = </span></span><span class='Call' id='3872-3881'><span class='nav-anchor' 
id='fn-call-3880'><span class='matched' id='3883-3884'><span class='Symbol matched' id='3885-3886'>halide_do_par_for</span>(</span><b class='variable matched' id='3888-3889'>::conv_y_par_for_conv_y_s0_x_xo_tile</b><span class='matched' id='3883-3890'>, </span><span class='IntImm Imm' id='3883-3892'>0</span><span class='matched' id='3883-3894'>, </span><b class='variable matched' id='3625-3896'>conv_y.s0.x.xo.tile.loop_extent</b><span class='matched' id='3883-3897'>, </span><span class='Cast' id='3883-3899'><span class='matched' id='3900-3901'><span class='Type' id='3902-3903'>(uint8_t *)</span>(</span><b class='variable matched' id='3814-3905'>parallel_closure</b><span class='matched' id='3900-3906'>)</span></span><span class='matched' id='3883-3908'>)</span></span></span></span></p><div class='AssertStmt WrapLine' id='3867-3910'><div class='node-cost' id='3911-3913'><div id='cc-3912' class='cost-btn CostColor2'   aria-describedby='tooltip-cc-3912'   line-cost='2' block-cost='2'   line-cost-color='2' block-cost-color='2'></div><span id='tooltip-cc-3912' class='tooltip cond-tooltop' role='tooltip-cc-3912'>Op Count: 2</span><div id='dc-3912' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-3912'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-dc-3912' class='tooltip cond-tooltop' role='tooltip-dc-3912'>Bits Moved: 0</span></div><span class='matched' id='3911-3915'><span class='Symbol matched' id='3916-3917'>assert</span>(</span><span class='BinaryOp' id='3911-3919'><span class='matched' id='3920-3921'>(</span><b class='variable matched' id='3865-3923'>closure_result</b> <span class='matched Operator' id='3920-3924'>==</span> <span class='IntImm Imm' id='3920-3926'>0</span><span class='matched' id='3920-3928'>)</span></span><span class='matched' id='3911-3930'>, </span><b class='variable matched' id='3865-3932'>closure_result</b><span class='matched' 
id='3911-3933'>)</span></div></div></div></div></div></div></div><span class='matched ClosingBrace cb-3702' id='3704-3935'>}</span></div></div><span class='matched ClosingBrace cb-3690' id='3692-3937'>}</span></div></div><div class='Free WrapLine' id='3540-3939'><div class='node-cost' id='3940-3942'><div id='cc-3941' class='cost-btn CostColor0'   aria-describedby='tooltip-cc-3941'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-cc-3941' class='tooltip cond-tooltop' role='tooltip-cc-3941'>Op Count: 0</span><div id='dc-3941' class='cost-btn CostColor0'   aria-describedby='tooltip-dc-3941'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-dc-3941' class='tooltip cond-tooltop' role='tooltip-dc-3941'>Bits Moved: 0</span></div><span class='keyword' id='3940-3944'>free </span><b class='variable matched' id='3522-3946'>kernel</b></div></div></div></div></div></div></div></div><span class='matched ClosingBrace cb-2617' id='2620-3947'>}</span></div></div></div></div></div></div></div></div></div></div></div></div></div></div></div></div></div><span class='matched ClosingBrace cb-1521' id='1523-3949'>}</span></div></div><span class='matched ClosingBrace cb-0' id='2-3951'>}</span></div></div>
+<div class='resize-bar' id='resize-bar-1'>
+                       <div class='collapse-btns'>
+                         <div>
+                           <button class='icon-btn resize-btn' onclick='collapse_code_tab()'>
+                             <i class='bi bi-arrow-bar-left' title='Collapse code tab'></i>
+                           </button>
+                         </div>
+                         <div>
+                           <button class='icon-btn resize-btn' onclick='collapseR_visualization_tab()'>
+                             <i class='bi bi-arrow-bar-right' title='Collapse visualization tab'></i>
+                           </button>
+                         </div>
+                       </div>
+                     </div><div id='ir-visualization-tab'>
+<div class='center fn-wrapper'><div class='fn-header'><button class='icon-btn' id='viz-0-hide' onclick='return toggleViz("viz-0");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-0-show' style = 'display:none;' onclick='return toggleViz("viz-0");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("lowered-func-conv_y_par_for_conv_y_s0_x_xo_tile")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><span class='fn-title' id='lowered-func-viz-conv_y_par_for_conv_y_s0_x_xo_tile'>Func: conv_y_par_for_conv_y_s0_x_xo_tile</span></div><div class='fn-body' id='viz-0'><div class='box center ForBox'><div class='box-header'><button class='icon-btn' id='viz-614-hide' onclick='return toggleViz("viz-614");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-614-show' style = 'display:none;' onclick='return toggleViz("viz-614");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("loop-614")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><button class='icon-btn sync-btn' onclick='scrollToAsm("16535")'>  <i class='bi bi-arrow-right-square' title='Jump to assembly'></i></button><div class='box-title'><span class='' id='loop-viz-614'>For: <b class='variable matched'>conv_y.s0.y.yi</b></span></div><div class='viz-cost-btns'><div id='vcc-614' class='cost-btn CostColor0'   aria-describedby='tooltip-vcc-614'   line-cost='0' block-cost='48'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-vcc-614' class='tooltip cond-tooltop' role='tooltip-vcc-614'>Op Count: 0</span><div id='vdc-614' class='cost-btn CostColor0'   aria-describedby='tooltip-vdc-614'   line-cost='0' block-cost='1472'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-vdc-614' class='tooltip cond-tooltop' 
role='tooltip-vdc-614'>Bits Moved: 0</span></div></div><div class='box-body' id='viz-614'><table class='allocate-table'><tr><th scope='col'>Min</th><td><span class='IntImm Imm' id='0-0'>0</span></td></tr><tr><th scope='col'>Extent</th><td><span class='IntImm Imm' id='0-0'>64</span></td></tr></table><div class='box center ForBox'><div class='box-header'><button class='icon-btn' id='viz-706-hide' onclick='return toggleViz("viz-706");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-706-show' style = 'display:none;' onclick='return toggleViz("viz-706");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("loop-706")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><button class='icon-btn sync-btn' onclick='scrollToAsm("16549")'>  <i class='bi bi-arrow-right-square' title='Jump to assembly'></i></button><div class='box-title'><span class='' id='loop-viz-706'>For: <b class='variable matched'>conv_y.s0.x.xi.xi</b></span></div><div class='viz-cost-btns'><div id='vcc-706' class='cost-btn CostColor0'   aria-describedby='tooltip-vcc-706'   line-cost='0' block-cost='44'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-vcc-706' class='tooltip cond-tooltop' role='tooltip-vcc-706'>Op Count: 0</span><div id='vdc-706' class='cost-btn CostColor0'   aria-describedby='tooltip-vdc-706'   line-cost='0' block-cost='1472'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-vdc-706' class='tooltip cond-tooltop' role='tooltip-vdc-706'>Bits Moved: 0</span></div></div><div class='box-body' id='viz-706'><table class='allocate-table'><tr><th scope='col'>Min</th><td><span class='IntImm Imm' id='0-0'>0</span></td></tr><tr><th scope='col'>Extent</th><td><span class='IntImm Imm' id='0-0'>16</span></td></tr></table><div class='box center AllocateBox'><div class='box-header'><button class='icon-btn' id='viz-728-hide' 
onclick='return toggleViz("viz-728");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-728-show' style = 'display:none;' onclick='return toggleViz("viz-728");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("allocate-728")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><div class='box-title'><span class='' id='allocate-viz-728'>Allocate: conv_x</span></div><div class='viz-cost-btns'><div id='vcc-728' class='cost-btn CostColor0'   aria-describedby='tooltip-vcc-728'   line-cost='0' block-cost='44'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-vcc-728' class='tooltip cond-tooltop' role='tooltip-vcc-728'>Op Count: 0</span><div id='vdc-728' class='cost-btn CostColor0'   aria-describedby='tooltip-vdc-728'   line-cost='0' block-cost='1472'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-vdc-728' class='tooltip cond-tooltop' role='tooltip-vdc-728'>Bits Moved: 0</span></div></div><div class='box-body' id='viz-728'><table class='allocate-table'><tr><th scope='col'>Memory Type</th><td>Auto</td></tr><tr><th scope='col'>Data Type</th><td>float32</td></tr><tr><th scope='col'>Dim-0</th><td><span class='IntImm Imm' id='0-0'>4</span></td></tr><tr><th scope='col'>Dim-1</th><td><span class='IntImm Imm' id='0-0'>20</span></td></tr></table><div class='box center ProducerBox'><div class='box-header'><button class='icon-btn' id='viz-750-hide' onclick='return toggleViz("viz-750");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-750-show' style = 'display:none;' onclick='return toggleViz("viz-750");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("prodcons-750")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><div class='box-title'><span class='' 
id='prodcons-viz-750'>Produce: <b class='variable matched'>conv_x</b></span></div><div class='viz-cost-btns'><div id='vcc-750' class='cost-btn CostColor0'   aria-describedby='tooltip-vcc-750'   line-cost='0' block-cost='25'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-vcc-750' class='tooltip cond-tooltop' role='tooltip-vcc-750'>Op Count: 0</span><div id='vdc-750' class='cost-btn CostColor0'   aria-describedby='tooltip-vdc-750'   line-cost='0' block-cost='672'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-vdc-750' class='tooltip cond-tooltop' role='tooltip-vdc-750'>Bits Moved: 0</span></div></div><div class='box-body' id='viz-750'><div class='box center ForBox'><div class='box-header'><button class='icon-btn' id='viz-798-hide' onclick='return toggleViz("viz-798");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-798-show' style = 'display:none;' onclick='return toggleViz("viz-798");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("loop-798")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><button class='icon-btn sync-btn' onclick='scrollToAsm("16559")'>  <i class='bi bi-arrow-right-square' title='Jump to assembly'></i></button><div class='box-title'><span class='' id='loop-viz-798'>For: <b class='variable matched'>conv_x.s0.y.rebased</b></span></div><div class='viz-cost-btns'><div id='vcc-798' class='cost-btn CostColor0'   aria-describedby='tooltip-vcc-798'   line-cost='0' block-cost='23'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-vcc-798' class='tooltip cond-tooltop' role='tooltip-vcc-798'>Op Count: 0</span><div id='vdc-798' class='cost-btn CostColor0'   aria-describedby='tooltip-vdc-798'   line-cost='0' block-cost='672'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-vdc-798' class='tooltip cond-tooltop' 
role='tooltip-vdc-798'>Bits Moved: 0</span></div></div><div class='box-body' id='viz-798'><table class='allocate-table'><tr><th scope='col'>Min</th><td><span class='IntImm Imm' id='0-0'>0</span></td></tr><tr><th scope='col'>Extent</th><td><span class='IntImm Imm' id='0-0'>20</span></td></tr></table><div class='box center AllocateBox'><div class='box-header'><button class='icon-btn' id='viz-820-hide' onclick='return toggleViz("viz-820");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-820-show' style = 'display:none;' onclick='return toggleViz("viz-820");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("allocate-820")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><div class='box-title'><span class='' id='allocate-viz-820'>Allocate: conv_x$1</span></div><div class='viz-cost-btns'><div id='vcc-820' class='cost-btn CostColor0'   aria-describedby='tooltip-vcc-820'   line-cost='0' block-cost='23'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-vcc-820' class='tooltip cond-tooltop' role='tooltip-vcc-820'>Op Count: 0</span><div id='vdc-820' class='cost-btn CostColor0'   aria-describedby='tooltip-vdc-820'   line-cost='0' block-cost='672'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-vdc-820' class='tooltip cond-tooltop' role='tooltip-vdc-820'>Bits Moved: 0</span></div></div><div class='box-body' id='viz-820'><table class='allocate-table'><tr><th scope='col'>Memory Type</th><td>Auto</td></tr><tr><th scope='col'>Data Type</th><td>float32</td></tr><tr><th scope='col'>Dim-0</th><td><span class='IntImm Imm' id='0-0'>4</span></td></tr></table><div class='box center ProducerBox'><div class='box-header'><button class='icon-btn' id='viz-840-hide' onclick='return toggleViz("viz-840");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' 
id='viz-840-show' style = 'display:none;' onclick='return toggleViz("viz-840");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("prodcons-840")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><div class='box-title'><span class='' id='prodcons-viz-840'>Produce: <b class='variable matched'>conv_x$1</b></span></div><div class='viz-cost-btns'><div id='vcc-840' class='cost-btn CostColor0'   aria-describedby='tooltip-vcc-840'   line-cost='0' block-cost='20'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-vcc-840' class='tooltip cond-tooltop' role='tooltip-vcc-840'>Op Count: 0</span><div id='vdc-840' class='cost-btn CostColor0'   aria-describedby='tooltip-vdc-840'   line-cost='0' block-cost='416'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-vdc-840' class='tooltip cond-tooltop' role='tooltip-vdc-840'>Bits Moved: 0</span></div></div><div class='box-body' id='viz-840'><div class='box center StoreBox'><div class='box-header'><button class='icon-btn' id='viz-854-hide' onclick='return toggleViz("viz-854");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-854-show' style = 'display:none;' onclick='return toggleViz("viz-854");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("store-854")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><div class='box-title'><span class='' id='store-viz-854'>Store: <b class='variable matched'>conv_x$1</b></span></div><div class='viz-cost-btns'><div id='vcc-854' class='cost-btn CostColor2'   aria-describedby='tooltip-vcc-854'   line-cost='2' block-cost='2'   line-cost-color='2' block-cost-color='2'></div><span id='tooltip-vcc-854' class='tooltip cond-tooltop' role='tooltip-vcc-854'>Op Count: 2</span><div id='vdc-854' class='cost-btn CostColor6'   
aria-describedby='tooltip-vdc-854'   line-cost='128' block-cost='128'   line-cost-color='6' block-cost-color='6'></div><span id='tooltip-vdc-854' class='tooltip cond-tooltop' role='tooltip-vdc-854'>Bits Moved: 128</span></div></div><div class='box-body' id='viz-854'><table class='allocate-table'><tr><th scope='col'>Type</th><td>Dense Vector</td></tr><tr><th scope='col'>Output Tile</th><td>float32x4</td></tr></table></div></div><div class='box center ForBox'><div class='box-header'><button class='icon-btn' id='viz-921-hide' onclick='return toggleViz("viz-921");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-921-show' style = 'display:none;' onclick='return toggleViz("viz-921");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("loop-921")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><button class='icon-btn sync-btn' onclick='scrollToAsm("16567")'>  <i class='bi bi-arrow-right-square' title='Jump to assembly'></i></button><div class='box-title'><span class='' id='loop-viz-921'>For: <b class='variable matched'>conv_x$1.s1.k$x</b></span></div><div class='viz-cost-btns'><div id='vcc-921' class='cost-btn CostColor0'   aria-describedby='tooltip-vcc-921'   line-cost='0' block-cost='17'   line-cost-color='0' block-cost-color='17'></div><span id='tooltip-vcc-921' class='tooltip cond-tooltop' role='tooltip-vcc-921'>Op Count: 0</span><div id='vdc-921' class='cost-btn CostColor0'   aria-describedby='tooltip-vdc-921'   line-cost='0' block-cost='288'   line-cost-color='0' block-cost-color='13'></div><span id='tooltip-vdc-921' class='tooltip cond-tooltop' role='tooltip-vdc-921'>Bits Moved: 0</span></div></div><div class='box-body' id='viz-921'><table class='allocate-table'><tr><th scope='col'>Min</th><td><span class='IntImm Imm' id='0-0'>0</span></td></tr><tr><th scope='col'>Extent</th><td><span class='IntImm Imm' 
id='0-0'>20</span></td></tr></table><div class='box center LoadBox'><div class='box-header'><button class='icon-btn' id='viz-980-hide' onclick='return toggleViz("viz-980");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-980-show' style = 'display:none;' onclick='return toggleViz("viz-980");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("load-980")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><div class='box-title'><span class='' id='load-viz-980'>Load: <b class='variable matched'>conv_x$1</b></span></div><div class='viz-cost-btns'><div id='vcc-980' class='cost-btn CostColor1'   aria-describedby='tooltip-vcc-980'   line-cost='1' block-cost='1'   line-cost-color='1' block-cost-color='1'></div><span id='tooltip-vcc-980' class='tooltip cond-tooltop' role='tooltip-vcc-980'>Op Count: 1</span><div id='vdc-980' class='cost-btn CostColor6'   aria-describedby='tooltip-vdc-980'   line-cost='128' block-cost='128'   line-cost-color='6' block-cost-color='6'></div><span id='tooltip-vdc-980' class='tooltip cond-tooltop' role='tooltip-vdc-980'>Bits Moved: 128</span></div></div><div class='box-body' id='viz-980'><table class='allocate-table'><tr><th scope='col'>Type</th><td>Dense Vector</td></tr><tr><th scope='col'>Output Tile</th><td>float32x4</td></tr></table></div></div><div class='box center LoadBox'><div class='box-header'><button class='icon-btn' id='viz-1061-hide' onclick='return toggleViz("viz-1061");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-1061-show' style = 'display:none;' onclick='return toggleViz("viz-1061");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("load-1061")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><div class='box-title'><span class='' 
id='load-viz-1061'>Load: <b class='variable matched'>kernel</b></span></div><div class='viz-cost-btns'><div id='vcc-1061' class='cost-btn CostColor0'   aria-describedby='tooltip-vcc-1061'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-vcc-1061' class='tooltip cond-tooltop' role='tooltip-vcc-1061'>Op Count: 0</span><div id='vdc-1061' class='cost-btn CostColor1'   aria-describedby='tooltip-vdc-1061'   line-cost='32' block-cost='32'   line-cost-color='1' block-cost-color='1'></div><span id='tooltip-vdc-1061' class='tooltip cond-tooltop' role='tooltip-vdc-1061'>Bits Moved: 32</span></div></div><div class='box-body' id='viz-1061'><table class='allocate-table'><tr><th scope='col'>Type</th><td>Scalar</td></tr><tr><th scope='col'>Output</th><td>float32</td></tr></table></div></div><div class='box center StoreBox'><div class='box-header'><button class='icon-btn' id='viz-943-hide' onclick='return toggleViz("viz-943");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-943-show' style = 'display:none;' onclick='return toggleViz("viz-943");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("store-943")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><div class='box-title'><span class='' id='store-viz-943'>Store: <b class='variable matched'>conv_x$1</b></span></div><div class='viz-cost-btns'><div id='vcc-943' class='cost-btn CostColor17'   aria-describedby='tooltip-vcc-943'   line-cost='17' block-cost='17'   line-cost-color='17' block-cost-color='17'></div><span id='tooltip-vcc-943' class='tooltip cond-tooltop' role='tooltip-vcc-943'>Op Count: 17</span><div id='vdc-943' class='cost-btn CostColor6'   aria-describedby='tooltip-vdc-943'   line-cost='128' block-cost='128'   line-cost-color='6' block-cost-color='6'></div><span id='tooltip-vdc-943' class='tooltip cond-tooltop' 
role='tooltip-vdc-943'>Bits Moved: 128</span></div></div><div class='box-body' id='viz-943'><table class='allocate-table'><tr><th scope='col'>Type</th><td>Dense Vector</td></tr><tr><th scope='col'>Output Tile</th><td>float32x4</td></tr></table></div></div></div></div></div></div><div class='box center ConsumerBox'><div class='box-header'><button class='icon-btn' id='viz-1079-hide' onclick='return toggleViz("viz-1079");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-1079-show' style = 'display:none;' onclick='return toggleViz("viz-1079");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("prodcons-1079")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><button class='icon-btn sync-btn' onclick='scrollToAsm("16582")'>  <i class='bi bi-arrow-right-square' title='Jump to assembly'></i></button><div class='box-title'><span class='' id='prodcons-viz-1079'>Consume: <b class='variable matched'>conv_x$1</b></span></div><div class='viz-cost-btns'><div id='vcc-1079' class='cost-btn CostColor0'   aria-describedby='tooltip-vcc-1079'   line-cost='0' block-cost='3'   line-cost-color='0' block-cost-color='3'></div><span id='tooltip-vcc-1079' class='tooltip cond-tooltop' role='tooltip-vcc-1079'>Op Count: 0</span><div id='vdc-1079' class='cost-btn CostColor0'   aria-describedby='tooltip-vdc-1079'   line-cost='0' block-cost='256'   line-cost-color='0' block-cost-color='12'></div><span id='tooltip-vdc-1079' class='tooltip cond-tooltop' role='tooltip-vdc-1079'>Bits Moved: 0</span></div></div><div class='box-body' id='viz-1079'><div class='box center LoadBox'><div class='box-header'><button class='icon-btn' id='viz-1133-hide' onclick='return toggleViz("viz-1133");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-1133-show' style = 'display:none;' onclick='return toggleViz("viz-1133");'>  
<i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("load-1133")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><div class='box-title'><span class='' id='load-viz-1133'>Load: <b class='variable matched'>conv_x$1</b></span></div><div class='viz-cost-btns'><div id='vcc-1133' class='cost-btn CostColor1'   aria-describedby='tooltip-vcc-1133'   line-cost='1' block-cost='1'   line-cost-color='1' block-cost-color='1'></div><span id='tooltip-vcc-1133' class='tooltip cond-tooltop' role='tooltip-vcc-1133'>Op Count: 1</span><div id='vdc-1133' class='cost-btn CostColor6'   aria-describedby='tooltip-vdc-1133'   line-cost='128' block-cost='128'   line-cost-color='6' block-cost-color='6'></div><span id='tooltip-vdc-1133' class='tooltip cond-tooltop' role='tooltip-vdc-1133'>Bits Moved: 128</span></div></div><div class='box-body' id='viz-1133'><table class='allocate-table'><tr><th scope='col'>Type</th><td>Dense Vector</td></tr><tr><th scope='col'>Output Tile</th><td>float32x4</td></tr></table></div></div><div class='box center StoreBox'><div class='box-header'><button class='icon-btn' id='viz-1091-hide' onclick='return toggleViz("viz-1091");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-1091-show' style = 'display:none;' onclick='return toggleViz("viz-1091");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("store-1091")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><div class='box-title'><span class='' id='store-viz-1091'>Store: <b class='variable matched'>conv_x</b></span></div><div class='viz-cost-btns'><div id='vcc-1091' class='cost-btn CostColor3'   aria-describedby='tooltip-vcc-1091'   line-cost='3' block-cost='3'   line-cost-color='3' block-cost-color='3'></div><span id='tooltip-vcc-1091' class='tooltip cond-tooltop' 
role='tooltip-vcc-1091'>Op Count: 3</span><div id='vdc-1091' class='cost-btn CostColor6'   aria-describedby='tooltip-vdc-1091'   line-cost='128' block-cost='128'   line-cost-color='6' block-cost-color='6'></div><span id='tooltip-vdc-1091' class='tooltip cond-tooltop' role='tooltip-vdc-1091'>Bits Moved: 128</span></div></div><div class='box-body' id='viz-1091'><table class='allocate-table'><tr><th scope='col'>Alignment</th><td>aligned(4, 0)</td></tr><tr><th scope='col'>Type</th><td>Dense Vector</td></tr><tr><th scope='col'>Output Tile</th><td>float32x4</td></tr></table></div></div></div></div></div></div></div></div></div></div><div class='box center ConsumerBox'><div class='box-header'><button class='icon-btn' id='viz-1172-hide' onclick='return toggleViz("viz-1172");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-1172-show' style = 'display:none;' onclick='return toggleViz("viz-1172");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("prodcons-1172")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><div class='box-title'><span class='' id='prodcons-viz-1172'>Consume: <b class='variable matched'>conv_x</b></span></div><div class='viz-cost-btns'><div id='vcc-1172' class='cost-btn CostColor0'   aria-describedby='tooltip-vcc-1172'   line-cost='0' block-cost='19'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-vcc-1172' class='tooltip cond-tooltop' role='tooltip-vcc-1172'>Op Count: 0</span><div id='vdc-1172' class='cost-btn CostColor0'   aria-describedby='tooltip-vdc-1172'   line-cost='0' block-cost='800'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-vdc-1172' class='tooltip cond-tooltop' role='tooltip-vdc-1172'>Bits Moved: 0</span></div></div><div class='box-body' id='viz-1172'><div class='box center AllocateBox'><div class='box-header'><button class='icon-btn' id='viz-1184-hide' 
onclick='return toggleViz("viz-1184");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-1184-show' style = 'display:none;' onclick='return toggleViz("viz-1184");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("allocate-1184")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><div class='box-title'><span class='' id='allocate-viz-1184'>Allocate: conv_y$1</span></div><div class='viz-cost-btns'><div id='vcc-1184' class='cost-btn CostColor0'   aria-describedby='tooltip-vcc-1184'   line-cost='0' block-cost='19'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-vcc-1184' class='tooltip cond-tooltop' role='tooltip-vcc-1184'>Op Count: 0</span><div id='vdc-1184' class='cost-btn CostColor0'   aria-describedby='tooltip-vdc-1184'   line-cost='0' block-cost='800'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-vdc-1184' class='tooltip cond-tooltop' role='tooltip-vdc-1184'>Bits Moved: 0</span></div></div><div class='box-body' id='viz-1184'><table class='allocate-table'><tr><th scope='col'>Memory Type</th><td>Auto</td></tr><tr><th scope='col'>Data Type</th><td>float32</td></tr><tr><th scope='col'>Dim-0</th><td><span class='IntImm Imm' id='0-0'>4</span></td></tr></table><div class='box center ProducerBox'><div class='box-header'><button class='icon-btn' id='viz-1204-hide' onclick='return toggleViz("viz-1204");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-1204-show' style = 'display:none;' onclick='return toggleViz("viz-1204");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("prodcons-1204")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><div class='box-title'><span class='' id='prodcons-viz-1204'>Produce: <b class='variable 
matched'>conv_y$1</b></span></div><div class='viz-cost-btns'><div id='vcc-1204' class='cost-btn CostColor0'   aria-describedby='tooltip-vcc-1204'   line-cost='0' block-cost='15'   line-cost-color='0' block-cost-color='15'></div><span id='tooltip-vcc-1204' class='tooltip cond-tooltop' role='tooltip-vcc-1204'>Op Count: 0</span><div id='vdc-1204' class='cost-btn CostColor0'   aria-describedby='tooltip-vdc-1204'   line-cost='0' block-cost='544'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-vdc-1204' class='tooltip cond-tooltop' role='tooltip-vdc-1204'>Bits Moved: 0</span></div></div><div class='box-body' id='viz-1204'><div class='box center StoreBox'><div class='box-header'><button class='icon-btn' id='viz-1218-hide' onclick='return toggleViz("viz-1218");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-1218-show' style = 'display:none;' onclick='return toggleViz("viz-1218");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("store-1218")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><div class='box-title'><span class='' id='store-viz-1218'>Store: <b class='variable matched'>conv_y$1</b></span></div><div class='viz-cost-btns'><div id='vcc-1218' class='cost-btn CostColor2'   aria-describedby='tooltip-vcc-1218'   line-cost='2' block-cost='2'   line-cost-color='2' block-cost-color='2'></div><span id='tooltip-vcc-1218' class='tooltip cond-tooltop' role='tooltip-vcc-1218'>Op Count: 2</span><div id='vdc-1218' class='cost-btn CostColor6'   aria-describedby='tooltip-vdc-1218'   line-cost='128' block-cost='128'   line-cost-color='6' block-cost-color='6'></div><span id='tooltip-vdc-1218' class='tooltip cond-tooltop' role='tooltip-vdc-1218'>Bits Moved: 128</span></div></div><div class='box-body' id='viz-1218'><table class='allocate-table'><tr><th scope='col'>Type</th><td>Dense Vector</td></tr><tr><th 
scope='col'>Output Tile</th><td>float32x4</td></tr></table></div></div><div class='box center ForBox'><div class='box-header'><button class='icon-btn' id='viz-1259-hide' onclick='return toggleViz("viz-1259");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-1259-show' style = 'display:none;' onclick='return toggleViz("viz-1259");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("loop-1259")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><button class='icon-btn sync-btn' onclick='scrollToAsm("16591")'>  <i class='bi bi-arrow-right-square' title='Jump to assembly'></i></button><div class='box-title'><span class='' id='loop-viz-1259'>For: <b class='variable matched'>conv_y$1.s1.k$x</b></span></div><div class='viz-cost-btns'><div id='vcc-1259' class='cost-btn CostColor0'   aria-describedby='tooltip-vcc-1259'   line-cost='0' block-cost='13'   line-cost-color='0' block-cost-color='13'></div><span id='tooltip-vcc-1259' class='tooltip cond-tooltop' role='tooltip-vcc-1259'>Op Count: 0</span><div id='vdc-1259' class='cost-btn CostColor0'   aria-describedby='tooltip-vdc-1259'   line-cost='0' block-cost='416'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-vdc-1259' class='tooltip cond-tooltop' role='tooltip-vdc-1259'>Bits Moved: 0</span></div></div><div class='box-body' id='viz-1259'><table class='allocate-table'><tr><th scope='col'>Min</th><td><span class='IntImm Imm' id='0-0'>0</span></td></tr><tr><th scope='col'>Extent</th><td><span class='IntImm Imm' id='0-0'>20</span></td></tr></table><div class='box center LoadBox'><div class='box-header'><button class='icon-btn' id='viz-1318-hide' onclick='return toggleViz("viz-1318");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-1318-show' style = 'display:none;' onclick='return toggleViz("viz-1318");'>  <i 
class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("load-1318")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><div class='box-title'><span class='' id='load-viz-1318'>Load: <b class='variable matched'>conv_y$1</b></span></div><div class='viz-cost-btns'><div id='vcc-1318' class='cost-btn CostColor1'   aria-describedby='tooltip-vcc-1318'   line-cost='1' block-cost='1'   line-cost-color='1' block-cost-color='1'></div><span id='tooltip-vcc-1318' class='tooltip cond-tooltop' role='tooltip-vcc-1318'>Op Count: 1</span><div id='vdc-1318' class='cost-btn CostColor6'   aria-describedby='tooltip-vdc-1318'   line-cost='128' block-cost='128'   line-cost-color='6' block-cost-color='6'></div><span id='tooltip-vdc-1318' class='tooltip cond-tooltop' role='tooltip-vdc-1318'>Bits Moved: 128</span></div></div><div class='box-body' id='viz-1318'><table class='allocate-table'><tr><th scope='col'>Type</th><td>Dense Vector</td></tr><tr><th scope='col'>Output Tile</th><td>float32x4</td></tr></table></div></div><div class='box center LoadBox'><div class='box-header'><button class='icon-btn' id='viz-1349-hide' onclick='return toggleViz("viz-1349");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-1349-show' style = 'display:none;' onclick='return toggleViz("viz-1349");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("load-1349")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><div class='box-title'><span class='' id='load-viz-1349'>Load: <b class='variable matched'>conv_x</b></span></div><div class='viz-cost-btns'><div id='vcc-1349' class='cost-btn CostColor2'   aria-describedby='tooltip-vcc-1349'   line-cost='2' block-cost='2'   line-cost-color='2' block-cost-color='2'></div><span id='tooltip-vcc-1349' class='tooltip cond-tooltop' 
role='tooltip-vcc-1349'>Op Count: 2</span><div id='vdc-1349' class='cost-btn CostColor6'   aria-describedby='tooltip-vdc-1349'   line-cost='128' block-cost='128'   line-cost-color='6' block-cost-color='6'></div><span id='tooltip-vdc-1349' class='tooltip cond-tooltop' role='tooltip-vdc-1349'>Bits Moved: 128</span></div></div><div class='box-body' id='viz-1349'><table class='allocate-table'><tr><th scope='col'>Alignment</th><td>aligned(4, 0)</td></tr><tr><th scope='col'>Type</th><td>Dense Vector</td></tr><tr><th scope='col'>Output Tile</th><td>float32x4</td></tr></table></div></div><div class='box center LoadBox'><div class='box-header'><button class='icon-btn' id='viz-1389-hide' onclick='return toggleViz("viz-1389");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-1389-show' style = 'display:none;' onclick='return toggleViz("viz-1389");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("load-1389")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><div class='box-title'><span class='' id='load-viz-1389'>Load: <b class='variable matched'>kernel</b></span></div><div class='viz-cost-btns'><div id='vcc-1389' class='cost-btn CostColor0'   aria-describedby='tooltip-vcc-1389'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-vcc-1389' class='tooltip cond-tooltop' role='tooltip-vcc-1389'>Op Count: 0</span><div id='vdc-1389' class='cost-btn CostColor1'   aria-describedby='tooltip-vdc-1389'   line-cost='32' block-cost='32'   line-cost-color='1' block-cost-color='1'></div><span id='tooltip-vdc-1389' class='tooltip cond-tooltop' role='tooltip-vdc-1389'>Bits Moved: 32</span></div></div><div class='box-body' id='viz-1389'><table class='allocate-table'><tr><th scope='col'>Type</th><td>Scalar</td></tr><tr><th scope='col'>Output</th><td>float32</td></tr></table></div></div><div class='box 
center StoreBox'><div class='box-header'><button class='icon-btn' id='viz-1281-hide' onclick='return toggleViz("viz-1281");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-1281-show' style = 'display:none;' onclick='return toggleViz("viz-1281");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("store-1281")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><div class='box-title'><span class='' id='store-viz-1281'>Store: <b class='variable matched'>conv_y$1</b></span></div><div class='viz-cost-btns'><div id='vcc-1281' class='cost-btn CostColor13'   aria-describedby='tooltip-vcc-1281'   line-cost='13' block-cost='13'   line-cost-color='13' block-cost-color='13'></div><span id='tooltip-vcc-1281' class='tooltip cond-tooltop' role='tooltip-vcc-1281'>Op Count: 13</span><div id='vdc-1281' class='cost-btn CostColor6'   aria-describedby='tooltip-vdc-1281'   line-cost='128' block-cost='128'   line-cost-color='6' block-cost-color='6'></div><span id='tooltip-vdc-1281' class='tooltip cond-tooltop' role='tooltip-vdc-1281'>Bits Moved: 128</span></div></div><div class='box-body' id='viz-1281'><table class='allocate-table'><tr><th scope='col'>Type</th><td>Dense Vector</td></tr><tr><th scope='col'>Output Tile</th><td>float32x4</td></tr></table></div></div></div></div></div></div><div class='box center ConsumerBox'><div class='box-header'><button class='icon-btn' id='viz-1415-hide' onclick='return toggleViz("viz-1415");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-1415-show' style = 'display:none;' onclick='return toggleViz("viz-1415");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("prodcons-1415")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><button class='icon-btn sync-btn' 
onclick='scrollToAsm("16605")'>  <i class='bi bi-arrow-right-square' title='Jump to assembly'></i></button><div class='box-title'><span class='' id='prodcons-viz-1415'>Consume: <b class='variable matched'>conv_y$1</b></span></div><div class='viz-cost-btns'><div id='vcc-1415' class='cost-btn CostColor0'   aria-describedby='tooltip-vcc-1415'   line-cost='0' block-cost='4'   line-cost-color='0' block-cost-color='4'></div><span id='tooltip-vcc-1415' class='tooltip cond-tooltop' role='tooltip-vcc-1415'>Op Count: 0</span><div id='vdc-1415' class='cost-btn CostColor0'   aria-describedby='tooltip-vdc-1415'   line-cost='0' block-cost='256'   line-cost-color='0' block-cost-color='12'></div><span id='tooltip-vdc-1415' class='tooltip cond-tooltop' role='tooltip-vdc-1415'>Bits Moved: 0</span></div></div><div class='box-body' id='viz-1415'><div class='box center LoadBox'><div class='box-header'><button class='icon-btn' id='viz-1478-hide' onclick='return toggleViz("viz-1478");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-1478-show' style = 'display:none;' onclick='return toggleViz("viz-1478");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("load-1478")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><div class='box-title'><span class='' id='load-viz-1478'>Load: <b class='variable matched'>conv_y$1</b></span></div><div class='viz-cost-btns'><div id='vcc-1478' class='cost-btn CostColor1'   aria-describedby='tooltip-vcc-1478'   line-cost='1' block-cost='1'   line-cost-color='1' block-cost-color='1'></div><span id='tooltip-vcc-1478' class='tooltip cond-tooltop' role='tooltip-vcc-1478'>Op Count: 1</span><div id='vdc-1478' class='cost-btn CostColor6'   aria-describedby='tooltip-vdc-1478'   line-cost='128' block-cost='128'   line-cost-color='6' block-cost-color='6'></div><span id='tooltip-vdc-1478' class='tooltip cond-tooltop' 
role='tooltip-vdc-1478'>Bits Moved: 128</span></div></div><div class='box-body' id='viz-1478'><table class='allocate-table'><tr><th scope='col'>Type</th><td>Dense Vector</td></tr><tr><th scope='col'>Output Tile</th><td>float32x4</td></tr></table></div></div><div class='box center StoreBox'><div class='box-header'><button class='icon-btn' id='viz-1427-hide' onclick='return toggleViz("viz-1427");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-1427-show' style = 'display:none;' onclick='return toggleViz("viz-1427");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("store-1427")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><div class='box-title'><span class='' id='store-viz-1427'>Store: <b class='variable matched'>conv_y</b></span></div><div class='viz-cost-btns'><div id='vcc-1427' class='cost-btn CostColor4'   aria-describedby='tooltip-vcc-1427'   line-cost='4' block-cost='4'   line-cost-color='4' block-cost-color='4'></div><span id='tooltip-vcc-1427' class='tooltip cond-tooltop' role='tooltip-vcc-1427'>Op Count: 4</span><div id='vdc-1427' class='cost-btn CostColor6'   aria-describedby='tooltip-vdc-1427'   line-cost='128' block-cost='128'   line-cost-color='6' block-cost-color='6'></div><span id='tooltip-vdc-1427' class='tooltip cond-tooltop' role='tooltip-vdc-1427'>Bits Moved: 128</span></div></div><div class='box-body' id='viz-1427'><table class='allocate-table'><tr><th scope='col'>Type</th><td>Dense Vector</td></tr><tr><th scope='col'>Output Tile</th><td>float32x4</td></tr></table></div></div></div></div></div></div></div></div></div></div></div></div></div></div></div></div><div class='center fn-wrapper'><div class='fn-header'><button class='icon-btn' id='viz-1-hide' onclick='return toggleViz("viz-1");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-1-show' 
style = 'display:none;' onclick='return toggleViz("viz-1");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("lowered-func-conv_y")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><span class='fn-title' id='lowered-func-viz-conv_y'>Func: conv_y</span></div><div class='fn-body' id='viz-1'><div class='tf-tree tf-gap-sm tf-custom-ir-viz'><ul class=''><li class=''><span class='tf-nc if-node'><div class='box center IfBox'><div class='box-header'><button class='icon-btn' id='viz-2-hide' onclick='return toggleViz("viz-2");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-2-show' style = 'display:none;' onclick='return toggleViz("viz-2");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("cond-2385")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><div class='box-title'><span class='' id='cond-viz-2385'>If: <button title='Click to see path condition' id='cond-3' aria-describedby='cond-tooltip-3' class='trunc-cond' role='button'>...</button><span id='cond-tooltip-3' class='tooltip cond-tooltop' role='cond-tooltip-3'><span class='Call' id='0-0'><span class='nav-anchor' id='fn-call-2393'><span class='matched' id='2-3'><span class='Symbol matched' id='4-5'>_halide_buffer_is_bounds_query</span>(</span><b class='variable matched' id='7-8'>conv_y.buffer</b><span class='matched' id='2-9'>)</span></span></span></span></span></div><div class='viz-cost-btns'><div id='vcc-2' class='cost-btn CostColor11'   aria-describedby='tooltip-vcc-2'   line-cost='11' block-cost='11'   line-cost-color='11' block-cost-color='11'></div><span id='tooltip-vcc-2' class='tooltip cond-tooltop' role='tooltip-vcc-2'>Op Count: 11</span><div id='vdc-2' class='cost-btn CostColor0'   aria-describedby='tooltip-vdc-2'   line-cost='0' block-cost='0'   line-cost-color='0' 
block-cost-color='0'></div><span id='tooltip-vdc-2' class='tooltip cond-tooltop' role='tooltip-vdc-2'>Bits Moved: 0</span></div></div><div class='box-body' id='viz-2'><div class='fn-call'><button class='icon-btn sync-btn' onclick='scrollToCode("fn-call-2438")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><b class='variable matched'>_halide_buffer_init</b>(...)</div></div></div></span></li></ul></div><div class='tf-tree tf-gap-sm tf-custom-ir-viz'><ul class=''><li class=''><span class='tf-nc if-node'><div class='box center IfBox'><div class='box-header'><button class='icon-btn' id='viz-4-hide' onclick='return toggleViz("viz-4");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-4-show' style = 'display:none;' onclick='return toggleViz("viz-4");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("cond-2618")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><div class='box-title'><span class='' id='cond-viz-2618'>If: <button title='Click to see path condition' id='cond-5' aria-describedby='cond-tooltip-5' class='trunc-cond' role='button'>...</button><span id='cond-tooltip-5' class='tooltip cond-tooltop' role='cond-tooltip-5'><span class='Not' id='0-0'>!<span class='Call' id='1-2'><span class='nav-anchor' id='fn-call-2393'><span class='matched' id='4-5'><span class='Symbol matched' id='6-7'>_halide_buffer_is_bounds_query</span>(</span><b class='variable matched' id='9-10'>conv_y.buffer</b><span class='matched' id='4-11'>)</span></span></span></span></span></span></div><div class='viz-cost-btns'><div id='vcc-4' class='cost-btn CostColor0'   aria-describedby='tooltip-vcc-4'   line-cost='0' block-cost='58'   line-cost-color='0' block-cost-color='19'></div><span id='tooltip-vcc-4' class='tooltip cond-tooltop' role='tooltip-vcc-4'>Op Count: 0</span><div id='vdc-4' class='cost-btn CostColor0'   
aria-describedby='tooltip-vdc-4'   line-cost='0' block-cost='32'   line-cost-color='0' block-cost-color='1'></div><span id='tooltip-vdc-4' class='tooltip cond-tooltop' role='tooltip-vdc-4'>Bits Moved: 0</span></div></div><div class='box-body' id='viz-4'><div class='box center AllocateBox'><div class='box-header'><button class='icon-btn' id='viz-3521-hide' onclick='return toggleViz("viz-3521");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-3521-show' style = 'display:none;' onclick='return toggleViz("viz-3521");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("allocate-3521")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><div class='box-title'><span class='' id='allocate-viz-3521'>Allocate: kernel</span></div><div class='viz-cost-btns'><div id='vcc-3521' class='cost-btn CostColor0'   aria-describedby='tooltip-vcc-3521'   line-cost='0' block-cost='18'   line-cost-color='0' block-cost-color='18'></div><span id='tooltip-vcc-3521' class='tooltip cond-tooltop' role='tooltip-vcc-3521'>Op Count: 0</span><div id='vdc-3521' class='cost-btn CostColor0'   aria-describedby='tooltip-vdc-3521'   line-cost='0' block-cost='32'   line-cost-color='0' block-cost-color='1'></div><span id='tooltip-vdc-3521' class='tooltip cond-tooltop' role='tooltip-vdc-3521'>Bits Moved: 0</span></div></div><div class='box-body' id='viz-3521'><table class='allocate-table'><tr><th scope='col'>Memory Type</th><td>Auto</td></tr><tr><th scope='col'>Data Type</th><td>float32</td></tr><tr><th scope='col'>Dim-0</th><td><span class='IntImm Imm' id='0-0'>20</span></td></tr></table><div class='box center ProducerBox'><div class='box-header'><button class='icon-btn' id='viz-3541-hide' onclick='return toggleViz("viz-3541");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-3541-show' style = 'display:none;' 
onclick='return toggleViz("viz-3541");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("prodcons-3541")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><div class='box-title'><span class='' id='prodcons-viz-3541'>Produce: <b class='variable matched'>kernel</b></span></div><div class='viz-cost-btns'><div id='vcc-3541' class='cost-btn CostColor0'   aria-describedby='tooltip-vcc-3541'   line-cost='0' block-cost='3'   line-cost-color='0' block-cost-color='3'></div><span id='tooltip-vcc-3541' class='tooltip cond-tooltop' role='tooltip-vcc-3541'>Op Count: 0</span><div id='vdc-3541' class='cost-btn CostColor0'   aria-describedby='tooltip-vdc-3541'   line-cost='0' block-cost='32'   line-cost-color='0' block-cost-color='1'></div><span id='tooltip-vdc-3541' class='tooltip cond-tooltop' role='tooltip-vdc-3541'>Bits Moved: 0</span></div></div><div class='box-body' id='viz-3541'><div class='box center ForBox'><div class='box-header'><button class='icon-btn' id='viz-3553-hide' onclick='return toggleViz("viz-3553");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-3553-show' style = 'display:none;' onclick='return toggleViz("viz-3553");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("loop-3553")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><button class='icon-btn sync-btn' onclick='scrollToAsm("16881")'>  <i class='bi bi-arrow-right-square' title='Jump to assembly'></i></button><div class='box-title'><span class='' id='loop-viz-3553'>For: <b class='variable matched'>kernel.s0.x</b></span></div><div class='viz-cost-btns'><div id='vcc-3553' class='cost-btn CostColor0'   aria-describedby='tooltip-vcc-3553'   line-cost='0' block-cost='3'   line-cost-color='0' block-cost-color='3'></div><span id='tooltip-vcc-3553' class='tooltip 
cond-tooltop' role='tooltip-vcc-3553'>Op Count: 0</span><div id='vdc-3553' class='cost-btn CostColor0'   aria-describedby='tooltip-vdc-3553'   line-cost='0' block-cost='32'   line-cost-color='0' block-cost-color='1'></div><span id='tooltip-vdc-3553' class='tooltip cond-tooltop' role='tooltip-vdc-3553'>Bits Moved: 0</span></div></div><div class='box-body' id='viz-3553'><table class='allocate-table'><tr><th scope='col'>Min</th><td><span class='IntImm Imm' id='0-0'>0</span></td></tr><tr><th scope='col'>Extent</th><td><span class='IntImm Imm' id='0-0'>20</span></td></tr></table><div class='box center StoreBox'><div class='box-header'><button class='icon-btn' id='viz-3575-hide' onclick='return toggleViz("viz-3575");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-3575-show' style = 'display:none;' onclick='return toggleViz("viz-3575");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("store-3575")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><div class='box-title'><span class='' id='store-viz-3575'>Store: <b class='variable matched'>kernel</b></span></div><div class='viz-cost-btns'><div id='vcc-3575' class='cost-btn CostColor3'   aria-describedby='tooltip-vcc-3575'   line-cost='3' block-cost='3'   line-cost-color='3' block-cost-color='3'></div><span id='tooltip-vcc-3575' class='tooltip cond-tooltop' role='tooltip-vcc-3575'>Op Count: 3</span><div id='vdc-3575' class='cost-btn CostColor1'   aria-describedby='tooltip-vdc-3575'   line-cost='32' block-cost='32'   line-cost-color='1' block-cost-color='1'></div><span id='tooltip-vdc-3575' class='tooltip cond-tooltop' role='tooltip-vdc-3575'>Bits Moved: 32</span></div></div><div class='box-body' id='viz-3575'><table class='allocate-table'><tr><th scope='col'>Type</th><td>Scalar</td></tr><tr><th 
scope='col'>Output</th><td>float32</td></tr></table></div></div></div></div></div></div><div class='box center ProducerBox'><div class='box-header'><button class='icon-btn' id='viz-3690-hide' onclick='return toggleViz("viz-3690");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-3690-show' style = 'display:none;' onclick='return toggleViz("viz-3690");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("prodcons-3690")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><div class='box-title'><span class='' id='prodcons-viz-3690'>Produce: <b class='variable matched'>conv_y</b></span></div><div class='viz-cost-btns'><div id='vcc-3690' class='cost-btn CostColor0'   aria-describedby='tooltip-vcc-3690'   line-cost='0' block-cost='10'   line-cost-color='0' block-cost-color='10'></div><span id='tooltip-vcc-3690' class='tooltip cond-tooltop' role='tooltip-vcc-3690'>Op Count: 0</span><div id='vdc-3690' class='cost-btn CostColor0'   aria-describedby='tooltip-vdc-3690'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-vdc-3690' class='tooltip cond-tooltop' role='tooltip-vdc-3690'>Bits Moved: 0</span></div></div><div class='box-body' id='viz-3690'><div class='box center ConsumerBox'><div class='box-header'><button class='icon-btn' id='viz-3702-hide' onclick='return toggleViz("viz-3702");'>  <i class='bi bi-dash-square' title='Collapse block'></i></button><button class='icon-btn' id='viz-3702-show' style = 'display:none;' onclick='return toggleViz("viz-3702");'>  <i class='bi bi-plus-square' title='Expand block'></i></button><button class='icon-btn sync-btn' onclick='scrollToCode("prodcons-3702")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><div class='box-title'><span class='' id='prodcons-viz-3702'>Consume: <b class='variable matched'>kernel</b></span></div><div 
class='viz-cost-btns'><div id='vcc-3702' class='cost-btn CostColor0'   aria-describedby='tooltip-vcc-3702'   line-cost='0' block-cost='10'   line-cost-color='0' block-cost-color='10'></div><span id='tooltip-vcc-3702' class='tooltip cond-tooltop' role='tooltip-vcc-3702'>Op Count: 0</span><div id='vdc-3702' class='cost-btn CostColor0'   aria-describedby='tooltip-vdc-3702'   line-cost='0' block-cost='0'   line-cost-color='0' block-cost-color='0'></div><span id='tooltip-vdc-3702' class='tooltip cond-tooltop' role='tooltip-vdc-3702'>Bits Moved: 0</span></div></div><div class='box-body' id='viz-3702'><div class='fn-call'><button class='icon-btn sync-btn' onclick='scrollToCode("fn-call-3880")'>  <i class='bi bi-arrow-left-square' title='Jump to code'></i></button><b class='variable matched'>halide_do_par_for</b>(...)</div></div></div></div></div></div></div></div></div></span></li></ul></div></div></div></div>
+<div class='resize-bar' id='resize-bar-2'>
+                       <div class='collapse-btns'>
+                         <div>
+                           <button class='icon-btn resize-btn' onclick='collapseL_visualization_tab()'>
+                             <i class='bi bi-arrow-bar-left' title='Collapse visualization tab'></i>
+                           </button>
+                         </div>
+                         <div>
+                           <button class='icon-btn resize-btn' onclick='collapse_assembly_tab()'>
+                             <i class='bi bi-arrow-bar-right' title='Collapse assembly tab'></i>
+                           </button>
+                         </div>
+                       </div>
+                     </div><div id='assembly-tab'>
+<div id='assemblyContent' style='display: none;'>
+<pre>
+	.text
+	.file	"posix_aligned_alloc.cpp"
+	.section	.text.halide_internal_aligned_alloc,"ax",@progbits
+	.weak	halide_internal_aligned_alloc   # -- Begin function halide_internal_aligned_alloc
+	.p2align	4, 0x90
+	.type	halide_internal_aligned_alloc,@function
+halide_internal_aligned_alloc:          # @halide_internal_aligned_alloc
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r14
+	pushq	%rbx
+	movq	%rdi, %rbx
+	leaq	(%rsi,%rdi,2), %rdi
+	decq	%rdi
+	movq	%rbx, %r14
+	negq	%r14
+	andq	%r14, %rdi
+	callq	malloc@PLT
+	testq	%rax, %rax
+	je	.LBB0_1
+# %bb.2:                                # %if.end
+	movq	%rax, %rcx
+	addq	%rbx, %rax
+	addq	$7, %rax
+	andq	%r14, %rax
+	movq	%rcx, -8(%rax)
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	retq
+.LBB0_1:
+	xorl	%eax, %eax
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	retq
+.Lfunc_end0:
+	.size	halide_internal_aligned_alloc, .Lfunc_end0-halide_internal_aligned_alloc
+                                        # -- End function
+	.section	.text.halide_internal_aligned_free,"ax",@progbits
+	.weak	halide_internal_aligned_free    # -- Begin function halide_internal_aligned_free
+	.p2align	4, 0x90
+	.type	halide_internal_aligned_free,@function
+halide_internal_aligned_free:           # @halide_internal_aligned_free
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	-8(%rdi), %rdi
+	popq	%rbp
+	jmp	free@PLT                        # TAILCALL
+.Lfunc_end1:
+	.size	halide_internal_aligned_free, .Lfunc_end1-halide_internal_aligned_free
+                                        # -- End function
+	.section	.text.halide_default_malloc,"ax",@progbits
+	.weak	halide_default_malloc           # -- Begin function halide_default_malloc
+	.p2align	4, 0x90
+	.type	halide_default_malloc,@function
+halide_default_malloc:                  # @halide_default_malloc
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	leaq	127(%rsi), %rdi
+	andq	$-64, %rdi
+	callq	malloc@PLT
+	testq	%rax, %rax
+	je	.LBB2_1
+# %bb.2:                                # %if.end.i
+	movq	%rax, %rcx
+	addq	$71, %rax
+	andq	$-64, %rax
+	movq	%rcx, -8(%rax)
+	popq	%rbp
+	retq
+.LBB2_1:
+	xorl	%eax, %eax
+	popq	%rbp
+	retq
+.Lfunc_end2:
+	.size	halide_default_malloc, .Lfunc_end2-halide_default_malloc
+                                        # -- End function
+	.section	.text.halide_default_free,"ax",@progbits
+	.weak	halide_default_free             # -- Begin function halide_default_free
+	.p2align	4, 0x90
+	.type	halide_default_free,@function
+halide_default_free:                    # @halide_default_free
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	-8(%rsi), %rdi
+	popq	%rbp
+	jmp	free@PLT                        # TAILCALL
+.Lfunc_end3:
+	.size	halide_default_free, .Lfunc_end3-halide_default_free
+                                        # -- End function
+	.section	.text.halide_set_custom_malloc,"ax",@progbits
+	.weak	halide_set_custom_malloc        # -- Begin function halide_set_custom_malloc
+	.p2align	4, 0x90
+	.type	halide_set_custom_malloc,@function
+halide_set_custom_malloc:               # @halide_set_custom_malloc
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	_ZN6Halide7Runtime8Internal13custom_mallocE@GOTPCREL(%rip), %rcx
+	movq	(%rcx), %rax
+	movq	%rdi, (%rcx)
+	popq	%rbp
+	retq
+.Lfunc_end4:
+	.size	halide_set_custom_malloc, .Lfunc_end4-halide_set_custom_malloc
+                                        # -- End function
+	.section	.text.halide_set_custom_free,"ax",@progbits
+	.weak	halide_set_custom_free          # -- Begin function halide_set_custom_free
+	.p2align	4, 0x90
+	.type	halide_set_custom_free,@function
+halide_set_custom_free:                 # @halide_set_custom_free
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	_ZN6Halide7Runtime8Internal11custom_freeE@GOTPCREL(%rip), %rcx
+	movq	(%rcx), %rax
+	movq	%rdi, (%rcx)
+	popq	%rbp
+	retq
+.Lfunc_end5:
+	.size	halide_set_custom_free, .Lfunc_end5-halide_set_custom_free
+                                        # -- End function
+	.section	.text.halide_malloc,"ax",@progbits
+	.weak	halide_malloc                   # -- Begin function halide_malloc
+	.p2align	4, 0x90
+	.type	halide_malloc,@function
+halide_malloc:                          # @halide_malloc
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	_ZN6Halide7Runtime8Internal13custom_mallocE@GOTPCREL(%rip), %rax
+	movq	(%rax), %rax
+	popq	%rbp
+	jmpq	*%rax                           # TAILCALL
+.Lfunc_end6:
+	.size	halide_malloc, .Lfunc_end6-halide_malloc
+                                        # -- End function
+	.section	.text.halide_free,"ax",@progbits
+	.weak	halide_free                     # -- Begin function halide_free
+	.p2align	4, 0x90
+	.type	halide_free,@function
+halide_free:                            # @halide_free
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	_ZN6Halide7Runtime8Internal11custom_freeE@GOTPCREL(%rip), %rax
+	movq	(%rax), %rax
+	popq	%rbp
+	jmpq	*%rax                           # TAILCALL
+.Lfunc_end7:
+	.size	halide_free, .Lfunc_end7-halide_free
+                                        # -- End function
+	.section	.text.halide_default_error,"ax",@progbits
+	.weak	halide_default_error            # -- Begin function halide_default_error
+	.p2align	4, 0x90
+	.type	halide_default_error,@function
+halide_default_error:                   # @halide_default_error
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%rbx
+	subq	$4104, %rsp                     # imm = 0x1008
+	movq	%rsi, %r15
+	movq	%rdi, %rbx
+	leaq	-26(%rbp), %rsi
+	leaq	.L.str(%rip), %rdx
+	leaq	-4120(%rbp), %r14
+	movq	%r14, %rdi
+	callq	halide_string_to_string@PLT
+	leaq	4094(%rax), %rsi
+	movq	%rax, %rdi
+	movq	%r15, %rdx
+	callq	halide_string_to_string@PLT
+	cmpb	$10, -1(%rax)
+	je	.LBB8_2
+# %bb.1:                                # %if.then
+	movw	$10, (%rax)
+	incq	%rax
+.LBB8_2:                                # %if.end
+	subq	%r14, %rax
+	incq	%rax
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	movq	%rax, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	callq	halide_print@PLT
+	callq	abort@PLT
+	addq	$4104, %rsp                     # imm = 0x1008
+	popq	%rbx
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end8:
+	.size	halide_default_error, .Lfunc_end8-halide_default_error
+                                        # -- End function
+	.section	.text.halide_error,"ax",@progbits
+	.weak	halide_error                    # -- Begin function halide_error
+	.p2align	4, 0x90
+	.type	halide_error,@function
+halide_error:                           # @halide_error
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	_ZN6Halide7Runtime8Internal13error_handlerE@GOTPCREL(%rip), %rax
+	movq	(%rax), %rax
+	popq	%rbp
+	jmpq	*%rax                           # TAILCALL
+.Lfunc_end9:
+	.size	halide_error, .Lfunc_end9-halide_error
+                                        # -- End function
+	.section	.text.halide_set_error_handler,"ax",@progbits
+	.weak	halide_set_error_handler        # -- Begin function halide_set_error_handler
+	.p2align	4, 0x90
+	.type	halide_set_error_handler,@function
+halide_set_error_handler:               # @halide_set_error_handler
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	_ZN6Halide7Runtime8Internal13error_handlerE@GOTPCREL(%rip), %rcx
+	movq	(%rcx), %rax
+	movq	%rdi, (%rcx)
+	popq	%rbp
+	retq
+.Lfunc_end10:
+	.size	halide_set_error_handler, .Lfunc_end10-halide_set_error_handler
+                                        # -- End function
+	.section	.text.halide_print,"ax",@progbits
+	.weak	halide_print                    # -- Begin function halide_print
+	.p2align	4, 0x90
+	.type	halide_print,@function
+halide_print:                           # @halide_print -- forwards its arguments unchanged to the function pointer currently stored in Halide::Runtime::Internal::custom_print
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	_ZN6Halide7Runtime8Internal12custom_printE@GOTPCREL(%rip), %rax # rax = address of the custom_print variable, read from the GOT
+	movq	(%rax), %rax                    # rax = pointer currently stored there (no null check is made)
+	popq	%rbp                            # frame is dropped first so the jump below reuses the caller's return address
+	jmpq	*%rax                           # TAILCALL through the loaded pointer; no argument register was modified above
+.Lfunc_end11:
+	.size	halide_print, .Lfunc_end11-halide_print
+                                        # -- End function
+	.section	.text.halide_set_custom_print,"ax",@progbits
+	.weak	halide_set_custom_print         # -- Begin function halide_set_custom_print
+	.p2align	4, 0x90
+	.type	halide_set_custom_print,@function
+halide_set_custom_print:                # @halide_set_custom_print -- stores rdi into Halide::Runtime::Internal::custom_print and returns the value it replaced
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	_ZN6Halide7Runtime8Internal12custom_printE@GOTPCREL(%rip), %rcx # rcx = address of the custom_print variable, read from the GOT
+	movq	(%rcx), %rax                    # rax (return value) = previously stored pointer
+	movq	%rdi, (%rcx)                    # store the first argument; the read and this write are two plain moves, not one atomic exchange
+	popq	%rbp
+	retq
+.Lfunc_end12:
+	.size	halide_set_custom_print, .Lfunc_end12-halide_set_custom_print
+                                        # -- End function
+	.section	.text.halide_start_clock,"ax",@progbits
+	.weak	halide_start_clock              # -- Begin function halide_start_clock
+	.p2align	4, 0x90
+	.type	halide_start_clock,@function
+halide_start_clock:                     # @halide_start_clock -- if halide_reference_clock_inited is zero, fills halide_reference_clock via syscall 228 and sets the flag; always returns 0
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%rbx                            # rbx holds the flag address across the call below, so its old value is saved
+	pushq	%rax                            # 8 bytes of padding, released by the addq $8 in the epilogue (presumably stack alignment for the call)
+	movq	halide_reference_clock_inited@GOTPCREL(%rip), %rbx # rbx = address of halide_reference_clock_inited
+	cmpb	$0, (%rbx)                      # plain byte test of the flag; no lock or atomic is used here
+	jne	.LBB13_2                        # flag already set: skip the clock read
+# %bb.1:                                # %if.then
+	movq	halide_reference_clock@GOTPCREL(%rip), %rdx # 3rd argument: address of halide_reference_clock, which receives the result
+	movl	$228, %edi                      # 1st argument: syscall number 228 (clock_gettime on x86-64 Linux -- NOTE(review): assumed target, confirm)
+	xorl	%esi, %esi                      # 2nd argument: 0 (clock id 0 is CLOCK_REALTIME on Linux -- confirm)
+	xorl	%eax, %eax                      # al = 0: no vector registers are used for this variadic call
+	callq	syscall@PLT                     # return value is not checked
+	movb	$1, (%rbx)                      # mark the reference clock as initialised
+.LBB13_2:                               # %if.end
+	xorl	%eax, %eax                      # return 0 on both paths
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%rbp
+	retq
+.Lfunc_end13:
+	.size	halide_start_clock, .Lfunc_end13-halide_start_clock
+                                        # -- End function
+	.section	.text.halide_current_time_ns,"ax",@progbits
+	.weak	halide_current_time_ns          # -- Begin function halide_current_time_ns
+	.p2align	4, 0x90
+	.type	halide_current_time_ns,@function
+halide_current_time_ns:                 # @halide_current_time_ns -- reads two 64-bit values via syscall 228, subtracts halide_reference_clock lane by lane, returns low delta * 1000000000 + high delta
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	subq	$16, %rsp                       # 16-byte local buffer at -16(%rbp) for the syscall result
+	leaq	-16(%rbp), %rdx                 # 3rd argument: address of that buffer
+	movl	$228, %edi                      # 1st argument: syscall number 228 (clock_gettime on x86-64 Linux -- NOTE(review): assumed target, confirm)
+	xorl	%esi, %esi                      # 2nd argument: 0 (clock id 0 is CLOCK_REALTIME on Linux -- confirm)
+	xorl	%eax, %eax                      # al = 0: no vector registers are used for this variadic call
+	callq	syscall@PLT                     # return value is not checked
+	vmovdqa	-16(%rbp), %xmm0                # load both 64-bit halves of the buffer (presumably timespec tv_sec and tv_nsec)
+	movq	halide_reference_clock@GOTPCREL(%rip), %rax # rax = address of halide_reference_clock
+	vpsubq	(%rax), %xmm0, %xmm0            # per-lane 64-bit subtract: buffer minus reference
+	vmovq	%xmm0, %rax                     # rax = low-lane difference
+	imulq	$1000000000, %rax, %rcx         # imm = 0x3B9ACA00; rcx = low-lane difference * 1e9
+	vpextrq	$1, %xmm0, %rax                 # rax = high-lane difference
+	addq	%rcx, %rax                      # return value = scaled low lane + high lane
+	addq	$16, %rsp
+	popq	%rbp
+	retq
+.Lfunc_end14:
+	.size	halide_current_time_ns, .Lfunc_end14-halide_current_time_ns
+                                        # -- End function
+	.section	.text.halide_sleep_ms,"ax",@progbits
+	.weak	halide_sleep_ms                 # -- Begin function halide_sleep_ms
+	.p2align	4, 0x90
+	.type	halide_sleep_ms,@function
+halide_sleep_ms:                        # @halide_sleep_ms -- tail-calls usleep with the 2nd argument multiplied by 1000; the incoming 1st argument is discarded
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	imull	$1000, %esi, %edi               # imm = 0x3E8; edi = esi * 1000, overwriting the 1st argument (32-bit multiply, so large inputs wrap)
+	popq	%rbp
+	jmp	usleep@PLT                      # TAILCALL; usleep's return value becomes this function's return value
+.Lfunc_end15:
+	.size	halide_sleep_ms, .Lfunc_end15-halide_sleep_ms
+                                        # -- End function
+	.section	.text.halide_default_print,"ax",@progbits
+	.weak	halide_default_print            # -- Begin function halide_default_print
+	.p2align	4, 0x90
+	.type	halide_default_print,@function
+halide_default_print:                   # @halide_default_print -- writes the NUL-terminated string passed in rsi to file descriptor 1; the 1st argument (rdi) is ignored
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%rbx                            # rbx keeps the string pointer alive across strlen, so its old value is saved
+	pushq	%rax                            # 8 bytes of padding, released by the addq $8 below (presumably stack alignment for the call)
+	movq	%rsi, %rbx                      # rbx = string pointer
+	movq	%rsi, %rdi                      # strlen argument
+	callq	strlen@PLT
+	movl	$1, %edi                        # write arg 1: file descriptor 1 (standard output)
+	movq	%rbx, %rsi                      # write arg 2: the string
+	movq	%rax, %rdx                      # write arg 3: length returned by strlen
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%rbp
+	jmp	write@PLT                       # TAILCALL; write's result is returned as-is, so a short write is not retried here
+.Lfunc_end16:
+	.size	halide_default_print, .Lfunc_end16-halide_default_print
+                                        # -- End function
+	.section	.text.halide_host_cpu_count,"ax",@progbits
+	.weak	halide_host_cpu_count           # -- Begin function halide_host_cpu_count
+	.p2align	4, 0x90
+	.type	halide_host_cpu_count,@function
+halide_host_cpu_count:                  # @halide_host_cpu_count -- returns whatever sysconf(84) returns
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movl	$84, %edi                       # sysconf name 84 (matches _SC_NPROCESSORS_ONLN in glibc -- NOTE(review): confirm for the target libc)
+	popq	%rbp
+	jmp	sysconf@PLT                     # TAILCALL; the result, including any error value, is passed back unchanged
+.Lfunc_end17:
+	.size	halide_host_cpu_count, .Lfunc_end17-halide_host_cpu_count
+                                        # -- End function
+	.section	.text.halide_thread_yield,"ax",@progbits
+	.weak	halide_thread_yield             # -- Begin function halide_thread_yield
+	.p2align	4, 0x90
+	.type	halide_thread_yield,@function
+halide_thread_yield:                    # @halide_thread_yield -- thin wrapper that tail-calls sched_yield
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	popq	%rbp                            # frame is set up and dropped with no work in between
+	jmp	sched_yield@PLT                 # TAILCALL; sched_yield's return value is passed back unchanged
+.Lfunc_end18:
+	.size	halide_thread_yield, .Lfunc_end18-halide_thread_yield
+                                        # -- End function
+	.section	.text.halide_default_do_task,"ax",@progbits
+	.weak	halide_default_do_task          # -- Begin function halide_default_do_task
+	.p2align	4, 0x90
+	.type	halide_default_do_task,@function
+halide_default_do_task:                 # @halide_default_do_task -- tail-calls the function pointer given as the 2nd argument with (arg 1, arg 3 as 32-bit, arg 4)
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	%rsi, %rax                      # rax = callee (incoming 2nd argument)
+	movl	%edx, %esi                      # callee arg 2 = incoming arg 3 (32-bit)
+	movq	%rcx, %rdx                      # callee arg 3 = incoming arg 4
+	popq	%rbp
+	jmpq	*%rax                           # TAILCALL; rdi (arg 1) is forwarded untouched and the callee's result is returned
+.Lfunc_end19:
+	.size	halide_default_do_task, .Lfunc_end19-halide_default_do_task
+                                        # -- End function
+	.section	.text.halide_default_do_loop_task,"ax",@progbits
+	.weak	halide_default_do_loop_task     # -- Begin function halide_default_do_loop_task
+	.p2align	4, 0x90
+	.type	halide_default_do_loop_task,@function
+halide_default_do_loop_task:            # @halide_default_do_loop_task -- tail-calls the function pointer given as the 2nd argument with (arg 1, arg 3, arg 4, arg 5, arg 6)
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	%rsi, %rax                      # rax = callee (incoming 2nd argument)
+	movl	%edx, %esi                      # callee arg 2 = incoming arg 3 (32-bit)
+	movl	%ecx, %edx                      # callee arg 3 = incoming arg 4 (32-bit)
+	movq	%r8, %rcx                       # callee arg 4 = incoming arg 5
+	movq	%r9, %r8                        # callee arg 5 = incoming arg 6
+	popq	%rbp
+	jmpq	*%rax                           # TAILCALL; rdi (arg 1) is forwarded untouched and the callee's result is returned
+.Lfunc_end20:
+	.size	halide_default_do_loop_task, .Lfunc_end20-halide_default_do_loop_task
+                                        # -- End function
+	.section	.text.halide_default_do_par_for,"ax",@progbits
+	.weak	halide_default_do_par_for       # -- Begin function halide_default_do_par_for
+	.p2align	4, 0x90
+	.type	halide_default_do_par_for,@function
+halide_default_do_par_for:              # @halide_default_do_par_for -- returns 0 if the 4th argument is not positive; otherwise builds one 128-byte job record on the stack, enqueues and runs it under the work_queue mutex, and returns the 32-bit value at record+116
+# %bb.0:                                # %entry
+	testl	%ecx, %ecx                      # 4th argument zero or negative?
+	jle	.LBB21_1                        # then return 0 without building a frame
+# %bb.2:                                # %if.end
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r14
+	pushq	%rbx
+	subq	$128, %rsp                      # the job record lives at -144(%rbp) up to -17(%rbp)
+	movq	$0, -144(%rbp)                  # record+0 = 0
+	movl	%edx, -108(%rbp)                # record+36 = 3rd argument
+	movl	%ecx, -104(%rbp)                # record+40 = 4th argument
+	movb	$0, -96(%rbp)                   # record+48 (byte) = 0
+	movl	$0, -112(%rbp)                  # record+32 = 0
+	movq	%r8, -136(%rbp)                 # record+8 = 5th argument
+	movl	$0, -100(%rbp)                  # record+44 = 0
+	vxorps	%xmm0, %xmm0, %xmm0
+	vmovups	%xmm0, -128(%rbp)               # record+16 .. record+31 = 0
+	movq	%rsi, -88(%rbp)                 # record+56 = 2nd argument
+	movq	%rdi, -40(%rbp)                 # record+104 = 1st argument
+	movq	$0, -32(%rbp)                   # record+112 .. record+119 = 0 (includes the word returned from -28(%rbp))
+	movl	$0, -24(%rbp)                   # record+120 = 0
+	movb	$0, -20(%rbp)                   # record+124 (byte) = 0
+	leaq	-144(%rbp), %rbx                # rbx = address of the record
+	movq	%rbx, -72(%rbp)                 # record+72 = the record's own address
+	movl	$0, -64(%rbp)                   # record+80 = 0
+	movq	$0, -56(%rbp)                   # record+88 = 0
+	movq	_ZN6Halide7Runtime8Internal10work_queueE@GOTPCREL(%rip), %r14 # r14 = address of Halide::Runtime::Internal::work_queue, used directly as the mutex argument
+	movq	%r14, %rdi
+	callq	halide_mutex_lock@PLT           # the two *_already_locked calls below run with this mutex taken
+	movl	$1, %edi                        # arg 1: one job
+	movq	%rbx, %rsi                      # arg 2: the record
+	xorl	%edx, %edx                      # arg 3: null
+	callq	_ZN6Halide7Runtime8Internal27enqueue_work_already_lockedEiPNS1_4workES3_@PLT
+	movq	%rbx, %rdi                      # arg 1: the record
+	callq	_ZN6Halide7Runtime8Internal28worker_thread_already_lockedEPNS1_4workE@PLT
+	movq	%r14, %rdi
+	callq	halide_mutex_unlock@PLT
+	movl	-28(%rbp), %eax                 # return value = record+116 (zeroed above; presumably updated by the worker call -- confirm)
+	addq	$128, %rsp
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	retq
+.LBB21_1:
+	xorl	%eax, %eax                      # early-out path: return 0
+	retq
+.Lfunc_end21:
+	.size	halide_default_do_par_for, .Lfunc_end21-halide_default_do_par_for
+                                        # -- End function
+	.section	.text.halide_mutex_lock,"ax",@progbits
+	.weak	halide_mutex_lock               # -- Begin function halide_mutex_lock
+	.p2align	4, 0x90
+	.type	halide_mutex_lock,@function
+halide_mutex_lock:                      # @halide_mutex_lock -- sets bit 0 of the 64-bit word at (rdi) with compare-exchange; on contention yields up to 40 times, then sets bit 1 and parks via parking_control::park (bit 0 presumably means locked, bit 1 presumably means waiters are parked)
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r12
+	pushq	%rbx
+	subq	$16, %rsp                       # 16-byte local object at -48(%rbp), only used on the park path
+	movl	$1, %ecx                        # desired value: 1
+	xorl	%eax, %eax                      # expected value: 0
+	lock		cmpxchgq	%rcx, (%rdi)    # fast path: if the word is 0, atomically make it 1
+	jne	.LBB22_1                        # word was not 0: take the slow path
+.LBB22_4:                               # %_ZN6Halide7Runtime8Internal15Synchronization10fast_mutex4lockEv.exit
+	addq	$16, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.LBB22_1:                               # %if.then.i
+	movq	%rdi, %rbx                      # rbx = mutex address for the rest of the slow path
+	movq	(%rdi), %rax                    # rax = current word
+	movl	$40, %r12d                      # r12d = yield attempts left before parking
+	movq	_ZTVN6Halide7Runtime8Internal15Synchronization21mutex_parking_controlE@GOTPCREL(%rip), %r15 # r15 = address of the vtable symbol for Synchronization::mutex_parking_control
+	addq	$16, %r15                       # plus 16: the value stored as the local object's first word below
+	leaq	-48(%rbp), %r14                 # r14 = address of the local object
+	.p2align	4, 0x90
+.LBB22_2:                               # %while.cond.outer.i.i
+                                        # =>This Inner Loop Header: Depth=1
+	testb	$1, %al                         # bit 0 of the word set?
+	jne	.LBB22_5                        # yes: go yield or park
+# %bb.3:                                # %if.then.i.i
+                                        #   in Loop: Header=BB22_2 Depth=1
+	movq	%rax, %rcx
+	orq	$1, %rcx                        # desired = current word with bit 0 set, other bits kept
+	lock		cmpxchgq	%rcx, (%rbx)    # on failure rax is refreshed with the word now in memory
+	jne	.LBB22_2                        # lost the race: re-examine the fresh word
+	jmp	.LBB22_4                        # bit 0 set by this thread: done
+.LBB22_5:                               # %if.end4.i.i
+                                        #   in Loop: Header=BB22_2 Depth=1
+	testl	%r12d, %r12d                    # any yield attempts left?
+	jg	.LBB22_6
+# %bb.8:                                # %if.end8.i.i
+                                        #   in Loop: Header=BB22_2 Depth=1
+	testb	$2, %al                         # bit 1 already set?
+	jne	.LBB22_10                       # yes: park straight away
+.LBB22_9:                               # %if.then10.i.i
+                                        #   in Loop: Header=BB22_2 Depth=1
+	movq	%rax, %rcx
+	orq	$2, %rcx                        # desired = current word with bit 1 set
+	lock		cmpxchgq	%rcx, (%rbx)
+	jne	.LBB22_2                        # word changed underneath: start over with the fresh value
+	jmp	.LBB22_10
+.LBB22_6:                               # %_ZN6Halide7Runtime8Internal15Synchronization12spin_control11should_spinEv.exit.i.i
+                                        #   in Loop: Header=BB22_2 Depth=1
+	decl	%r12d                           # use up one attempt
+	je	.LBB22_7                        # counter just reached zero: stop yielding
+# %bb.12:                               # %if.then6.i.i
+                                        #   in Loop: Header=BB22_2 Depth=1
+	callq	halide_thread_yield@PLT
+	movq	(%rbx), %rax                    # re-read the word after yielding
+	jmp	.LBB22_2
+.LBB22_7:                               #   in Loop: Header=BB22_2 Depth=1
+	xorl	%r12d, %r12d
+	testb	$2, %al
+	je	.LBB22_9                        # bit 1 clear: set it before parking
+.LBB22_10:                              # %if.end19.i.i
+                                        #   in Loop: Header=BB22_2 Depth=1
+	movq	%r15, -48(%rbp)                 # local object+0 = vtable pointer computed above
+	movq	%rbx, -40(%rbp)                 # local object+8 = mutex address
+	movq	%r14, %rdi                      # arg 1: the local object
+	movq	%rbx, %rsi                      # arg 2: mutex address
+	callq	_ZN6Halide7Runtime8Internal15Synchronization15parking_control4parkEy@PLT
+	cmpq	%rbx, %rax                      # did park return the mutex address?
+	je	.LBB22_4                        # yes: return without retrying the compare-exchange
+# %bb.11:                               # %if.end24.i.i
+                                        #   in Loop: Header=BB22_2 Depth=1
+	movq	(%rbx), %rax                    # otherwise re-read the word
+	movl	$40, %r12d                      # and restart with a full set of yield attempts
+	jmp	.LBB22_2
+.Lfunc_end22:
+	.size	halide_mutex_lock, .Lfunc_end22-halide_mutex_lock
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal27enqueue_work_already_lockedEiPNS1_4workES3_,"ax",@progbits
+	.weak	_ZN6Halide7Runtime8Internal27enqueue_work_already_lockedEiPNS1_4workES3_ # -- Begin function _ZN6Halide7Runtime8Internal27enqueue_work_already_lockedEiPNS1_4workES3_
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal27enqueue_work_already_lockedEiPNS1_4workES3_,@function
+_ZN6Halide7Runtime8Internal27enqueue_work_already_lockedEiPNS1_4workES3_: # @_ZN6Halide7Runtime8Internal27enqueue_work_already_lockedEiPNS1_4workES3_ -- Halide::Runtime::Internal::enqueue_work_already_locked(int, work*, work*): initialises work_queue on first use, summarises the given array of 128-byte job records, spawns threads when the 3rd argument is null, links the records onto the list headed at work_queue+16 and calls halide_cond_broadcast on work_queue+40 (conditionally +48 and +56)
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	subq	$40, %rsp
+	movq	%rsi, %r14                      # r14 = 2nd argument: base of the job record array
+	movl	%edi, %ebx                      # ebx = 1st argument: number of records
+	movq	_ZN6Halide7Runtime8Internal10work_queueE@GOTPCREL(%rip), %r8 # r8 = address of work_queue
+	cmpb	$0, 2121(%r8)                   # byte at work_queue+2121 is set to 1 at the end of the first-use path below
+	movq	%rdx, -64(%rbp)                 # 8-byte Spill of the 3rd argument
+	je	.LBB23_1                        # flag clear: run the first-use setup
+# %bb.9:                                # %if.end4
+	movl	%ebx, %r15d                     # r15 = record count, zero-extended
+	movl	%ebx, -48(%rbp)                 # 4-byte Spill
+	testl	%ebx, %ebx
+	jle	.LBB23_10                       # no records: skip the summary loop
+.LBB23_20:                              # %for.body.preheader
+	movq	%r15, %rax
+	shlq	$7, %rax                        # rax = count * 128 = byte size of the array
+	movl	$-1, %r10d                      # r10d starts at -1 and accumulates a per-record amount below
+	xorl	%ecx, %ecx                      # rcx = byte offset of the current record
+	movl	$1, %edi                        # constant 1 used as the cmov source
+	xorl	%r12d, %r12d                    # r12d = running sum of the record+44 fields
+	xorl	%r9d, %r9d                      # r9 = flag: some record+44 was non-zero
+	xorl	%ebx, %ebx                      # bl = flag: some record+32 was non-zero
+	xorl	%r13d, %r13d                    # r13 = flag: some record+44 was zero
+	jmp	.LBB23_21
+	.p2align	4, 0x90
+.LBB23_22:                              # %if.then23
+                                        #   in Loop: Header=BB23_21 Depth=1
+	incl	%r10d                           # record+48 byte is set: this record adds 1
+.LBB23_24:                              # %for.inc
+                                        #   in Loop: Header=BB23_21 Depth=1
+	addl	%esi, %r12d
+	subq	$-128, %rcx                     # rcx += 128: next record
+	cmpq	%rcx, %rax
+	je	.LBB23_11
+.LBB23_21:                              # %for.body
+                                        # =>This Inner Loop Header: Depth=1
+	movl	44(%r14,%rcx), %esi             # esi = record+44
+	testl	%esi, %esi
+	movzbl	%r13b, %r13d
+	cmovel	%edi, %r13d
+	movzbl	%r9b, %r9d
+	cmovnel	%edi, %r9d
+	cmpl	$0, 32(%r14,%rcx)
+	movzbl	%bl, %ebx
+	cmovnel	%edi, %ebx
+	cmpb	$0, 48(%r14,%rcx)
+	jne	.LBB23_22
+# %bb.23:                               # %if.else24
+                                        #   in Loop: Header=BB23_21 Depth=1
+	addl	40(%r14,%rcx), %r10d            # record+48 byte is clear: this record adds its record+40 value
+	jmp	.LBB23_24
+.LBB23_11:                              # %for.cond.cleanup.loopexit
+	andb	$1, %bl
+	andb	$1, %r9b
+	andb	$1, %r13b
+	testq	%rdx, %rdx                      # 3rd argument null?
+	movl	%r9d, -52(%rbp)                 # 4-byte Spill
+	je	.LBB23_13                       # null: take the thread-spawning path
+.LBB23_25:                              # %do.body61
+	movl	112(%rdx), %eax
+	imull	44(%rdx), %eax
+	subl	96(%rdx), %eax                  # eax = (3rd arg)+112 * (3rd arg)+44 - (3rd arg)+96
+	cmpl	%eax, %r12d
+	jle	.LBB23_27                       # summed record+44 values do not exceed it: skip the failure path
+# %bb.26:                               # %if.then66
+	leaq	.L.str.3(%rip), %rsi            # message text lives outside this block
+	xorl	%edi, %edi
+	movl	%r10d, %r12d
+	callq	halide_print@PLT
+	callq	abort@PLT                       # the reloads below are emitted as fall-through code after this call
+	movl	%r12d, %r10d
+	movl	-52(%rbp), %r9d                 # 4-byte Reload
+	movq	_ZN6Halide7Runtime8Internal10work_queueE@GOTPCREL(%rip), %r8
+	movq	-64(%rbp), %rdx                 # 8-byte Reload
+.LBB23_27:                              # %do.end69
+	movl	%ebx, %eax
+	orb	%r9b, %al                       # zero only when both flags are clear
+	movl	-48(%rbp), %esi                 # 4-byte Reload
+	je	.LBB23_29                       # still tests the orb result; the movl in between leaves flags alone
+# %bb.28:                               # %if.then73
+	incl	96(%rdx)                        # (3rd arg)+96 += 1; undone after .LBB23_36 before returning
+	jmp	.LBB23_29
+.LBB23_1:                               # %land.rhs.i.preheader
+	leaq	12(%r8), %rax                   # scan starts at work_queue+12
+	movl	$2128, %ecx                     # imm = 0x850
+	addq	_ZN6Halide7Runtime8Internal10work_queueE@GOTPCREL(%rip), %rcx # rcx = work_queue+2128: end of the scan
+	.p2align	4, 0x90
+.LBB23_2:                               # %land.rhs.i
+                                        # =>This Inner Loop Header: Depth=1
+	cmpb	$0, (%rax)
+	jne	.LBB23_4                        # stop at the first non-zero byte
+# %bb.3:                                # %while.body.i
+                                        #   in Loop: Header=BB23_2 Depth=1
+	incq	%rax
+	cmpq	%rcx, %rax
+	jb	.LBB23_2
+.LBB23_4:                               # %do.body.i
+	movl	$2128, %ecx                     # imm = 0x850
+	addq	_ZN6Halide7Runtime8Internal10work_queueE@GOTPCREL(%rip), %rcx
+	cmpq	%rcx, %rax
+	je	.LBB23_6                        # scan reached the end: every byte checked was zero
+# %bb.5:                                # %if.then.i
+	leaq	.L.str.6(%rip), %rsi            # message text lives outside this block
+	xorl	%edi, %edi
+	callq	halide_print@PLT
+	callq	abort@PLT
+	movq	_ZN6Halide7Runtime8Internal10work_queueE@GOTPCREL(%rip), %r8
+	movq	-64(%rbp), %rdx                 # 8-byte Reload
+.LBB23_6:                               # %_ZNK6Halide7Runtime8Internal12work_queue_t13assert_zeroedEv.exit
+	movl	8(%r8), %eax                    # eax = work_queue+8
+	testl	%eax, %eax
+	jne	.LBB23_8                        # non-zero: keep it
+# %bb.7:                                # %if.then2
+	callq	_ZN6Halide7Runtime8Internal27default_desired_num_threadsEv@PLT
+	movq	_ZN6Halide7Runtime8Internal10work_queueE@GOTPCREL(%rip), %r8
+	movq	-64(%rbp), %rdx                 # 8-byte Reload
+.LBB23_8:                               # %if.end
+	cmpl	$2, %eax
+	movl	$1, %ecx
+	cmovgel	%eax, %ecx                      # ecx = eax if eax is at least 2, else 1
+	cmpl	$256, %ecx                      # imm = 0x100
+	movl	$256, %eax                      # imm = 0x100
+	cmovll	%ecx, %eax                      # eax = ecx if below 256, else 256: value clamped to 1..256
+	movl	%eax, 8(%r8)                    # work_queue+8 = clamped value
+	movb	$1, 2121(%r8)                   # set the flag tested at function entry
+	movl	%ebx, %r15d
+	movl	%ebx, -48(%rbp)                 # 4-byte Spill
+	testl	%ebx, %ebx
+	jg	.LBB23_20                       # rejoin the main path at the summary loop
+.LBB23_10:
+	xorl	%r13d, %r13d                    # zero-record case: same starting values as the loop preheader
+	movl	$-1, %r10d
+	xorl	%ebx, %ebx
+	xorl	%r9d, %r9d
+	xorl	%r12d, %r12d
+	testq	%rdx, %rdx
+	movl	%r9d, -52(%rbp)                 # 4-byte Spill
+	jne	.LBB23_25
+.LBB23_13:                              # %if.then32
+	movl	%r10d, -68(%rbp)                # 4-byte Spill
+	movl	%ebx, %eax
+	orb	%r9b, %al
+	setne	-41(%rbp)                       # 1-byte Folded Spill; remembers whether either flag was set
+	movl	24(%r8), %ecx                   # ecx = work_queue+24
+	cmpl	$255, %ecx
+	jg	.LBB23_18                       # already above 255: no spawning
+# %bb.14:                               # %land.rhs.preheader
+	movzbl	%al, %eax
+	addl	%eax, %r12d                     # r12d += 1 when either flag was set
+	jmp	.LBB23_15
+	.p2align	4, 0x90
+.LBB23_17:                              # %while.body
+                                        #   in Loop: Header=BB23_15 Depth=1
+	incl	28(%r8)                         # work_queue+28 += 1
+	movq	_ZN6Halide7Runtime8Internal13worker_threadEPv@GOTPCREL(%rip), %rdi # arg 1: address of Halide::Runtime::Internal::worker_thread(void*)
+	xorl	%esi, %esi                      # arg 2: null
+	callq	halide_spawn_thread@PLT
+	movq	_ZN6Halide7Runtime8Internal10work_queueE@GOTPCREL(%rip), %r8
+	movslq	24(%r8), %rdx                   # rdx = old work_queue+24
+	leal	1(%rdx), %ecx
+	movl	%ecx, 24(%r8)                   # work_queue+24 += 1
+	movq	%rax, 72(%r8,%rdx,8)            # spawn result stored at work_queue+72 + 8 * old value
+	cmpq	$255, %rdx
+	jge	.LBB23_18                       # slot 255 was just filled: stop spawning
+.LBB23_15:                              # %land.rhs
+                                        # =>This Inner Loop Header: Depth=1
+	movl	8(%r8), %eax
+	decl	%eax
+	cmpl	%eax, %ecx
+	jl	.LBB23_17                       # spawn while work_queue+24 is below work_queue+8 minus 1
+# %bb.16:                               # %lor.rhs
+                                        #   in Loop: Header=BB23_15 Depth=1
+	subl	2124(%r8), %ecx
+	incl	%ecx
+	cmpl	%r12d, %ecx
+	jl	.LBB23_17                       # or while work_queue+24 minus work_queue+2124 plus 1 is below r12d
+.LBB23_18:                              # %do.end50
+	cmpb	$0, -41(%rbp)                   # 1-byte Folded Reload
+	movl	-48(%rbp), %esi                 # 4-byte Reload
+	movl	-68(%rbp), %r10d                # 4-byte Reload
+	je	.LBB23_29                       # tests the cmpb above; the two reloads leave flags alone
+# %bb.19:                               # %if.then54
+	incl	2124(%r8)                       # work_queue+2124 += 1; undone at .LBB23_39 before returning
+.LBB23_29:                              # %if.end77
+	testl	%esi, %esi
+	jle	.LBB23_33                       # no records: nothing to link
+# %bb.30:                               # %for.body83.preheader
+	movq	16(%r8), %rax                   # rax = current list head at work_queue+16
+	incq	%r15
+	.p2align	4, 0x90
+.LBB23_31:                              # %for.body83
+                                        # =>This Inner Loop Header: Depth=1
+	leal	-2(%r15), %ecx                  # index = r15 - 2, walking from count-1 down to 0
+	shlq	$7, %rcx
+	movq	%rax, 64(%r14,%rcx)             # record+64 = record linked on the previous pass (old head for the last record)
+	leaq	(%r14,%rcx), %rax               # rax = address of this record
+	movq	%r14, 72(%r14,%rcx)             # record+72 = base of the array
+	movl	%esi, 80(%r14,%rcx)             # record+80 = record count
+	movl	$0, 96(%r14,%rcx)               # record+96 = 0
+	decq	%r15
+	cmpq	$1, %r15
+	ja	.LBB23_31
+# %bb.32:                               # %for.cond80.for.cond.cleanup82_crit_edge
+	movq	%r14, 16(%r8)                   # work_queue+16 = first record: it becomes the new head
+.LBB23_33:                              # %for.cond.cleanup82
+	movl	24(%r8), %eax                   # eax = work_queue+24
+	movl	64(%r8), %ecx                   # ecx = work_queue+64
+	cmpl	%ecx, %r10d
+	cmovgl	%eax, %r10d                     # r10d above work_queue+64: use work_queue+24 instead
+	cmpl	%eax, %ecx
+	cmovll	%eax, %r10d                     # work_queue+64 below work_queue+24: use work_queue+24
+	cmpl	$0, 68(%r8)
+	cmovnel	%eax, %r10d                     # work_queue+68 non-zero: use work_queue+24
+	movl	%r10d, 32(%r8)                  # work_queue+32 = result
+	leaq	40(%r8), %rdi
+	callq	halide_cond_broadcast@PLT       # always broadcast on work_queue+40
+	movq	_ZN6Halide7Runtime8Internal10work_queueE@GOTPCREL(%rip), %rcx
+	movl	32(%rcx), %eax
+	cmpl	28(%rcx), %eax
+	jle	.LBB23_36                       # work_queue+32 not above work_queue+28: skip the other two broadcasts
+# %bb.34:                               # %if.then107
+	leaq	48(%rcx), %rdi
+	callq	halide_cond_broadcast@PLT
+	movq	_ZN6Halide7Runtime8Internal10work_queueE@GOTPCREL(%rip), %rcx
+	testb	%r13b, %r13b                    # did any record have record+44 equal to zero?
+	je	.LBB23_36
+# %bb.35:                               # %if.then109
+	leaq	56(%rcx), %rdi
+	callq	halide_cond_broadcast@PLT
+	movq	_ZN6Halide7Runtime8Internal10work_queueE@GOTPCREL(%rip), %rcx
+.LBB23_36:                              # %if.end111
+	orb	-52(%rbp), %bl                  # 1-byte Folded Reload
+	je	.LBB23_40                       # neither flag set: no counter was bumped earlier, so nothing to undo
+# %bb.37:                               # %if.then115
+	movq	-64(%rbp), %rax                 # 8-byte Reload
+	testq	%rax, %rax
+	je	.LBB23_39
+# %bb.38:                               # %if.then117
+	decl	96(%rax)                        # undo the increment of (3rd arg)+96 made at bb.28
+	jmp	.LBB23_40
+.LBB23_39:                              # %if.else120
+	decl	2124(%rcx)                      # undo the increment of work_queue+2124 made at bb.19
+.LBB23_40:                              # %if.end123
+	addq	$40, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end23:
+	.size	_ZN6Halide7Runtime8Internal27enqueue_work_already_lockedEiPNS1_4workES3_, .Lfunc_end23-_ZN6Halide7Runtime8Internal27enqueue_work_already_lockedEiPNS1_4workES3_
+                                        # -- End function
+	.section	.rodata.cst16,"aM",@progbits,16
+	.p2align	4, 0x0                          # -- Begin function _ZN6Halide7Runtime8Internal28worker_thread_already_lockedEPNS1_4workE
+.LCPI24_0:
+	.long	1                               # 0x1
+	.long	4294967295                      # 0xffffffff
+	.zero	4
+	.zero	4
+	.section	.text._ZN6Halide7Runtime8Internal28worker_thread_already_lockedEPNS1_4workE,"ax",@progbits
+	.weak	_ZN6Halide7Runtime8Internal28worker_thread_already_lockedEPNS1_4workE
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal28worker_thread_already_lockedEPNS1_4workE,@function
+_ZN6Halide7Runtime8Internal28worker_thread_already_lockedEPNS1_4workE: # @_ZN6Halide7Runtime8Internal28worker_thread_already_lockedEPNS1_4workE
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	subq	$88, %rsp
+	movq	%rdi, %rbx
+	xorl	%r14d, %r14d
+	movq	_ZN6Halide7Runtime8Internal10work_queueE@GOTPCREL(%rip), %r13
+	leaq	48(%r13), %rax
+	movq	%rax, -88(%rbp)                 # 8-byte Spill
+	leaq	40(%r13), %rax
+	movq	%rax, -80(%rbp)                 # 8-byte Spill
+	leaq	56(%r13), %rax
+	movq	%rax, -64(%rbp)                 # 8-byte Spill
+	leaq	16(%r13), %rax
+	movq	%rax, -104(%rbp)                # 8-byte Spill
+	jmp	.LBB24_1
+.LBB24_90:                              # %land.lhs.true307
+                                        #   in Loop: Header=BB24_1 Depth=1
+	cmpb	$0, 124(%r12)
+	movl	$0, %r14d
+	je	.LBB24_1
+	.p2align	4, 0x90
+.LBB24_91:                              # %if.then310
+                                        #   in Loop: Header=BB24_1 Depth=1
+	movq	-64(%rbp), %rdi                 # 8-byte Reload
+	callq	halide_cond_broadcast@PLT
+	xorl	%r14d, %r14d
+.LBB24_1:                               # %while.cond
+                                        # =>This Loop Header: Depth=1
+                                        #     Child Loop BB24_13 Depth 2
+                                        #     Child Loop BB24_11 Depth 2
+                                        #       Child Loop BB24_30 Depth 3
+                                        #     Child Loop BB24_51 Depth 2
+                                        #       Child Loop BB24_53 Depth 3
+                                        #         Child Loop BB24_54 Depth 4
+                                        #     Child Loop BB24_77 Depth 2
+	testq	%rbx, %rbx
+	je	.LBB24_7
+# %bb.2:                                # %cond.true
+                                        #   in Loop: Header=BB24_1 Depth=1
+	movl	112(%rbx), %eax
+	movl	40(%rbx), %ecx
+	orl	%eax, %ecx
+	je	.LBB24_92
+# %bb.3:                                # %if.then
+                                        #   in Loop: Header=BB24_1 Depth=1
+	movq	16(%r13), %r12
+	cmpl	$0, 116(%rbx)
+	je	.LBB24_16
+# %bb.4:                                # %if.then3
+                                        #   in Loop: Header=BB24_1 Depth=1
+	testl	%eax, %eax
+	jne	.LBB24_9
+# %bb.5:                                # %while.cond6.preheader
+                                        #   in Loop: Header=BB24_1 Depth=1
+	cmpq	%rbx, %r12
+	je	.LBB24_6
+	.p2align	4, 0x90
+.LBB24_13:                              # %while.body8
+                                        #   Parent Loop BB24_1 Depth=1
+                                        # =>  This Inner Loop Header: Depth=2
+	movq	%r12, %rax
+	movq	64(%r12), %r12
+	cmpq	%rbx, %r12
+	jne	.LBB24_13
+# %bb.14:                               # %while.end.loopexit
+                                        #   in Loop: Header=BB24_1 Depth=1
+	addq	$64, %rax
+	jmp	.LBB24_15
+	.p2align	4, 0x90
+.LBB24_7:                               # %cond.end
+                                        #   in Loop: Header=BB24_1 Depth=1
+	cmpb	$0, 2120(%r13)
+	jne	.LBB24_92
+# %bb.8:                                # %while.body.thread
+                                        #   in Loop: Header=BB24_1 Depth=1
+	movq	16(%r13), %r12
+.LBB24_9:                               # %do.end
+                                        #   in Loop: Header=BB24_1 Depth=1
+	testq	%r12, %r12
+	je	.LBB24_34
+# %bb.10:                               # %do.end27.preheader
+                                        #   in Loop: Header=BB24_1 Depth=1
+	movq	-104(%rbp), %r15                # 8-byte Reload
+	jmp	.LBB24_11
+	.p2align	4, 0x90
+.LBB24_33:                              # %cleanup
+                                        #   in Loop: Header=BB24_11 Depth=2
+	movq	%r12, %r15
+	movq	64(%r12), %rax
+	addq	$64, %r15
+	movq	%rax, %r12
+	testq	%rax, %rax
+	je	.LBB24_34
+.LBB24_11:                              # %do.end27
+                                        #   Parent Loop BB24_1 Depth=1
+                                        # =>  This Loop Header: Depth=2
+                                        #       Child Loop BB24_30 Depth 3
+	movq	88(%r12), %rax
+	testq	%rax, %rax
+	je	.LBB24_12
+# %bb.19:                               # %if.else32
+                                        #   in Loop: Header=BB24_11 Depth=2
+	movl	44(%rax), %edx
+	movl	112(%rax), %ecx
+	testl	%ecx, %ecx
+	je	.LBB24_21
+# %bb.20:                               # %if.else38
+                                        #   in Loop: Header=BB24_11 Depth=2
+	imull	%ecx, %edx
+.LBB24_21:                              # %if.end45
+                                        #   in Loop: Header=BB24_11 Depth=2
+	subl	96(%rax), %edx
+	jmp	.LBB24_22
+	.p2align	4, 0x90
+.LBB24_12:                              # %if.then31
+                                        #   in Loop: Header=BB24_11 Depth=2
+	movl	24(%r13), %edx
+	subl	2124(%r13), %edx
+	incl	%edx
+.LBB24_22:                              # %if.end45
+                                        #   in Loop: Header=BB24_11 Depth=2
+	movl	44(%r12), %ecx
+	movb	$1, %sil
+	movb	$1, %dil
+	testq	%rbx, %rbx
+	je	.LBB24_24
+# %bb.23:                               # %lor.lhs.false
+                                        #   in Loop: Header=BB24_11 Depth=2
+	movq	72(%r12), %rdi
+	cmpq	72(%rbx), %rdi
+	sete	%r8b
+	testl	%ecx, %ecx
+	sete	%dil
+	orb	%r8b, %dil
+.LBB24_24:                              # %lor.end
+                                        #   in Loop: Header=BB24_11 Depth=2
+	cmpb	$0, 48(%r12)
+	je	.LBB24_26
+# %bb.25:                               # %lor.rhs70
+                                        #   in Loop: Header=BB24_11 Depth=2
+	cmpl	$0, 112(%r12)
+	sete	%sil
+.LBB24_26:                              # %lor.end73
+                                        #   in Loop: Header=BB24_11 Depth=2
+	cmpl	%ecx, %edx
+	jl	.LBB24_33
+# %bb.27:                               # %lor.end73
+                                        #   in Loop: Header=BB24_11 Depth=2
+	xorb	$1, %dil
+	jne	.LBB24_33
+# %bb.28:                               # %lor.end73
+                                        #   in Loop: Header=BB24_11 Depth=2
+	testb	%sil, %sil
+	je	.LBB24_33
+# %bb.29:                               # %if.then86
+                                        #   in Loop: Header=BB24_11 Depth=2
+	movl	120(%r12), %edx
+	cmpl	32(%r12), %edx
+	jge	.LBB24_45
+	.p2align	4, 0x90
+.LBB24_30:                              # %for.body.i
+                                        #   Parent Loop BB24_1 Depth=1
+                                        #     Parent Loop BB24_11 Depth=2
+                                        # =>    This Inner Loop Header: Depth=3
+	movq	24(%r12), %rax
+	movslq	%edx, %rcx
+	shlq	$4, %rcx
+	movq	(%rax,%rcx), %rdi
+	movl	8(%rax,%rcx), %esi
+	callq	halide_default_semaphore_try_acquire@PLT
+	testb	%al, %al
+	je	.LBB24_33
+# %bb.31:                               # %for.inc.i
+                                        #   in Loop: Header=BB24_30 Depth=3
+	movl	120(%r12), %edx
+	incl	%edx
+	movl	%edx, 120(%r12)
+	cmpl	32(%r12), %edx
+	jl	.LBB24_30
+# %bb.32:                               # %if.else127.loopexit
+                                        #   in Loop: Header=BB24_1 Depth=1
+	leaq	88(%r12), %rsi
+	leaq	44(%r12), %rdx
+	movq	88(%r12), %rax
+	movl	44(%r12), %ecx
+	movl	$0, 120(%r12)
+	incl	112(%r12)
+	testq	%rax, %rax
+	je	.LBB24_47
+.LBB24_48:                              # %if.else143
+                                        #   in Loop: Header=BB24_1 Depth=1
+	addl	%ecx, 96(%rax)
+	cmpb	$0, 48(%r12)
+	movq	%rdx, -56(%rbp)                 # 8-byte Spill
+	movq	%rsi, -48(%rbp)                 # 8-byte Spill
+	je	.LBB24_69
+.LBB24_50:                              # %if.then156
+                                        #   in Loop: Header=BB24_1 Depth=1
+	movq	64(%r12), %rax
+	movq	%rax, (%r15)
+	movq	%r13, %rdi
+	callq	halide_mutex_unlock@PLT
+	xorl	%r14d, %r14d
+	movl	$1, %r15d
+	.p2align	4, 0x90
+.LBB24_51:                              # %while.cond161.preheader
+                                        #   Parent Loop BB24_1 Depth=1
+                                        # =>  This Loop Header: Depth=2
+                                        #       Child Loop BB24_53 Depth 3
+                                        #         Child Loop BB24_54 Depth 4
+	movl	40(%r12), %ecx
+	movl	%ecx, %eax
+	subl	%r14d, %eax
+	cmpl	%r15d, %eax
+	jle	.LBB24_60
+# %bb.52:                               # %land.rhs.preheader
+                                        #   in Loop: Header=BB24_51 Depth=2
+	movl	32(%r12), %eax
+	movl	120(%r12), %edx
+	jmp	.LBB24_53
+	.p2align	4, 0x90
+.LBB24_57:                              # %while.body167
+                                        #   in Loop: Header=BB24_53 Depth=3
+	movl	$0, 120(%r12)
+	incl	%r15d
+	movl	%ecx, %esi
+	subl	%r14d, %esi
+	xorl	%edx, %edx
+	cmpl	%r15d, %esi
+	jle	.LBB24_58
+.LBB24_53:                              # %land.rhs
+                                        #   Parent Loop BB24_1 Depth=1
+                                        #     Parent Loop BB24_51 Depth=2
+                                        # =>    This Loop Header: Depth=3
+                                        #         Child Loop BB24_54 Depth 4
+	cmpl	%eax, %edx
+	jge	.LBB24_57
+	.p2align	4, 0x90
+.LBB24_54:                              # %for.body.i441
+                                        #   Parent Loop BB24_1 Depth=1
+                                        #     Parent Loop BB24_51 Depth=2
+                                        #       Parent Loop BB24_53 Depth=3
+                                        # =>      This Inner Loop Header: Depth=4
+	movq	24(%r12), %rax
+	movslq	%edx, %rcx
+	shlq	$4, %rcx
+	movq	(%rax,%rcx), %rdi
+	movl	8(%rax,%rcx), %esi
+	callq	halide_default_semaphore_try_acquire@PLT
+	testb	%al, %al
+	je	.LBB24_60
+# %bb.55:                               # %for.inc.i444
+                                        #   in Loop: Header=BB24_54 Depth=4
+	movl	32(%r12), %eax
+	movl	120(%r12), %edx
+	incl	%edx
+	movl	%edx, 120(%r12)
+	cmpl	%eax, %edx
+	jl	.LBB24_54
+# %bb.56:                               # %while.body167.loopexit
+                                        #   in Loop: Header=BB24_53 Depth=3
+	movl	40(%r12), %ecx
+	jmp	.LBB24_57
+	.p2align	4, 0x90
+.LBB24_60:                              # %while.end169
+                                        #   in Loop: Header=BB24_51 Depth=2
+	testl	%r15d, %r15d
+	je	.LBB24_61
+.LBB24_58:                              # %if.end172
+                                        #   in Loop: Header=BB24_51 Depth=2
+	movq	104(%r12), %rdi
+	movl	36(%r12), %edx
+	addl	%r14d, %edx
+	movq	(%r12), %rsi
+	movq	8(%r12), %r8
+	movl	%r15d, %ecx
+	movq	%r12, %r9
+	callq	halide_do_loop_task@PLT
+	addl	%r15d, %r14d
+	xorl	%r15d, %r15d
+	testl	%eax, %eax
+	je	.LBB24_51
+# %bb.59:                               #   in Loop: Header=BB24_1 Depth=1
+	movl	%eax, %r13d
+	jmp	.LBB24_62
+	.p2align	4, 0x90
+.LBB24_34:                              # %if.then103
+                                        #   in Loop: Header=BB24_1 Depth=1
+	testq	%rbx, %rbx
+	je	.LBB24_38
+# %bb.35:                               # %if.then105
+                                        #   in Loop: Header=BB24_1 Depth=1
+	leal	1(%r14), %r15d
+	cmpl	$39, %r14d
+	jg	.LBB24_37
+# %bb.36:                               # %if.then107
+                                        #   in Loop: Header=BB24_1 Depth=1
+	movq	%r13, %rdi
+	callq	halide_mutex_unlock@PLT
+	callq	halide_thread_yield@PLT
+	movq	%r13, %rdi
+	callq	halide_mutex_lock@PLT
+	movl	%r15d, %r14d
+	jmp	.LBB24_1
+	.p2align	4, 0x90
+.LBB24_16:                              # %if.else
+                                        #   in Loop: Header=BB24_1 Depth=1
+	movq	88(%rbx), %rax
+	testq	%rax, %rax
+	je	.LBB24_9
+# %bb.17:                               # %land.lhs.true
+                                        #   in Loop: Header=BB24_1 Depth=1
+	movl	116(%rax), %eax
+	testl	%eax, %eax
+	je	.LBB24_9
+# %bb.18:                               # %if.then15
+                                        #   in Loop: Header=BB24_1 Depth=1
+	movl	%eax, 116(%rbx)
+	movq	-64(%rbp), %rdi                 # 8-byte Reload
+	callq	halide_cond_broadcast@PLT
+	jmp	.LBB24_1
+.LBB24_61:                              #   in Loop: Header=BB24_1 Depth=1
+	xorl	%r13d, %r13d
+	movb	$1, %r15b
+.LBB24_62:                              # %while.end179
+                                        #   in Loop: Header=BB24_1 Depth=1
+	movq	_ZN6Halide7Runtime8Internal10work_queueE@GOTPCREL(%rip), %rdi
+	callq	halide_mutex_lock@PLT
+	addl	%r14d, 36(%r12)
+	movl	40(%r12), %eax
+	subl	%r14d, %eax
+	movl	%eax, 40(%r12)
+	testb	%r15b, %r15b
+	je	.LBB24_63
+# %bb.66:                               # %if.else190
+                                        #   in Loop: Header=BB24_1 Depth=1
+	testl	%eax, %eax
+	jle	.LBB24_67
+# %bb.68:                               # %if.then194
+                                        #   in Loop: Header=BB24_1 Depth=1
+	movq	_ZN6Halide7Runtime8Internal10work_queueE@GOTPCREL(%rip), %rcx
+	movq	16(%rcx), %rax
+	movq	%rax, 64(%r12)
+	movq	%r12, 16(%rcx)
+.LBB24_67:                              #   in Loop: Header=BB24_1 Depth=1
+	xorl	%eax, %eax
+	movq	-56(%rbp), %r8                  # 8-byte Reload
+	movq	-48(%rbp), %r9                  # 8-byte Reload
+	movq	(%r9), %rcx
+	movl	(%r8), %edx
+	testq	%rcx, %rcx
+	jne	.LBB24_85
+	jmp	.LBB24_84
+.LBB24_63:                              # %if.end230.thread461
+                                        #   in Loop: Header=BB24_1 Depth=1
+	movl	$0, 40(%r12)
+	movq	-56(%rbp), %r8                  # 8-byte Reload
+	movq	-48(%rbp), %r9                  # 8-byte Reload
+	jmp	.LBB24_64
+.LBB24_38:                              # %if.else112
+                                        #   in Loop: Header=BB24_1 Depth=1
+	incl	64(%r13)
+	movl	28(%r13), %eax
+	cmpl	32(%r13), %eax
+	jle	.LBB24_40
+# %bb.39:                               # %if.then115
+                                        #   in Loop: Header=BB24_1 Depth=1
+	decl	%eax
+	movl	%eax, 28(%r13)
+	movq	-88(%rbp), %rdi                 # 8-byte Reload
+	movq	%r13, %rsi
+	callq	halide_cond_wait@PLT
+	incl	28(%r13)
+	decl	64(%r13)
+	jmp	.LBB24_1
+.LBB24_37:                              # %if.else108
+                                        #   in Loop: Header=BB24_1 Depth=1
+	incl	68(%r13)
+	movb	$1, 124(%rbx)
+	movq	-64(%rbp), %rdi                 # 8-byte Reload
+	movq	%r13, %rsi
+	callq	halide_cond_wait@PLT
+	movb	$0, 124(%rbx)
+	decl	68(%r13)
+	movl	%r15d, %r14d
+	jmp	.LBB24_1
+.LBB24_40:                              # %if.else118
+                                        #   in Loop: Header=BB24_1 Depth=1
+	leal	1(%r14), %r15d
+	cmpl	$39, %r14d
+	jg	.LBB24_42
+# %bb.41:                               # %if.then121
+                                        #   in Loop: Header=BB24_1 Depth=1
+	movq	%r13, %rdi
+	callq	halide_mutex_unlock@PLT
+	callq	halide_thread_yield@PLT
+	movq	%r13, %rdi
+	callq	halide_mutex_lock@PLT
+	jmp	.LBB24_43
+.LBB24_6:                               #   in Loop: Header=BB24_1 Depth=1
+	leaq	16(%r13), %rax
+.LBB24_15:                              # %while.end
+                                        #   in Loop: Header=BB24_1 Depth=1
+	movq	64(%rbx), %rcx
+	movq	%rcx, (%rax)
+	movl	$0, 40(%rbx)
+	jmp	.LBB24_1
+.LBB24_42:                              # %if.else122
+                                        #   in Loop: Header=BB24_1 Depth=1
+	movq	-80(%rbp), %rdi                 # 8-byte Reload
+	movq	%r13, %rsi
+	callq	halide_cond_wait@PLT
+.LBB24_43:                              # %if.end124
+                                        #   in Loop: Header=BB24_1 Depth=1
+	movl	%r15d, %r14d
+	decl	64(%r13)
+	jmp	.LBB24_1
+.LBB24_45:                              # %if.else127.loopexit63
+                                        #   in Loop: Header=BB24_1 Depth=1
+	leaq	88(%r12), %rsi
+	leaq	44(%r12), %rdx
+	movl	$0, 120(%r12)
+	incl	112(%r12)
+	testq	%rax, %rax
+	jne	.LBB24_48
+.LBB24_47:                              # %if.then136
+                                        #   in Loop: Header=BB24_1 Depth=1
+	addl	%ecx, 2124(%r13)
+	cmpb	$0, 48(%r12)
+	movq	%rdx, -56(%rbp)                 # 8-byte Spill
+	movq	%rsi, -48(%rbp)                 # 8-byte Spill
+	jne	.LBB24_50
+.LBB24_69:                              # %if.else198
+                                        #   in Loop: Header=BB24_1 Depth=1
+	movq	(%r12), %rax
+	movq	%rax, -96(%rbp)                 # 8-byte Spill
+	movq	8(%r12), %rax
+	movq	%rax, -72(%rbp)                 # 8-byte Spill
+	movq	56(%r12), %r13
+	movq	104(%r12), %r14
+	vmovq	36(%r12), %xmm0                 # xmm0 = mem[0],zero
+	vmovdqa	%xmm0, -128(%rbp)               # 16-byte Spill
+	vpaddd	.LCPI24_0(%rip), %xmm0, %xmm0
+	vmovq	%xmm0, 36(%r12)
+	vpextrd	$1, %xmm0, %eax
+	testl	%eax, %eax
+	jne	.LBB24_71
+# %bb.70:                               # %if.then208
+                                        #   in Loop: Header=BB24_1 Depth=1
+	movq	64(%r12), %rax
+	movq	%rax, (%r15)
+.LBB24_71:                              # %if.end210
+                                        #   in Loop: Header=BB24_1 Depth=1
+	movq	_ZN6Halide7Runtime8Internal10work_queueE@GOTPCREL(%rip), %rdi
+	callq	halide_mutex_unlock@PLT
+	vmovdqa	-128(%rbp), %xmm0               # 16-byte Reload
+	vmovd	%xmm0, %edx
+	movq	%r14, %rdi
+	testq	%r13, %r13
+	je	.LBB24_73
+# %bb.72:                               # %if.then212
+                                        #   in Loop: Header=BB24_1 Depth=1
+	movq	%r13, %rsi
+	movq	-72(%rbp), %rcx                 # 8-byte Reload
+	callq	halide_do_task@PLT
+	jmp	.LBB24_74
+.LBB24_73:                              # %if.else220
+                                        #   in Loop: Header=BB24_1 Depth=1
+	movq	-96(%rbp), %rsi                 # 8-byte Reload
+	movl	$1, %ecx
+	movq	-72(%rbp), %r8                  # 8-byte Reload
+	movq	%r12, %r9
+	callq	halide_do_loop_task@PLT
+.LBB24_74:                              # %if.end230
+                                        #   in Loop: Header=BB24_1 Depth=1
+	movl	%eax, %r13d
+	movq	_ZN6Halide7Runtime8Internal10work_queueE@GOTPCREL(%rip), %rdi
+	callq	halide_mutex_lock@PLT
+	testl	%r13d, %r13d
+	movq	-56(%rbp), %r8                  # 8-byte Reload
+	movq	-48(%rbp), %r9                  # 8-byte Reload
+	je	.LBB24_75
+.LBB24_64:                              # %if.then238
+                                        #   in Loop: Header=BB24_1 Depth=1
+	movl	%r13d, 116(%r12)
+	movl	80(%r12), %ecx
+	testl	%ecx, %ecx
+	jle	.LBB24_65
+# %bb.76:                               # %do.end243.lr.ph
+                                        #   in Loop: Header=BB24_1 Depth=1
+	movq	72(%r12), %rdx
+	shlq	$7, %rcx
+	xorl	%esi, %esi
+	xorl	%eax, %eax
+	jmp	.LBB24_77
+.LBB24_80:                              # %land.rhs254
+                                        #   in Loop: Header=BB24_77 Depth=2
+	movzbl	124(%rdx,%rsi), %edi
+.LBB24_81:                              # %land.end260
+                                        #   in Loop: Header=BB24_77 Depth=2
+	andb	$1, %al
+	orb	%dil, %al
+.LBB24_82:                              # %for.inc
+                                        #   in Loop: Header=BB24_77 Depth=2
+	subq	$-128, %rsi
+	cmpq	%rsi, %rcx
+	je	.LBB24_83
+.LBB24_77:                              # %do.end243
+                                        #   Parent Loop BB24_1 Depth=1
+                                        # =>  This Inner Loop Header: Depth=2
+	cmpl	$0, 116(%rdx,%rsi)
+	jne	.LBB24_82
+# %bb.78:                               # %if.then247
+                                        #   in Loop: Header=BB24_77 Depth=2
+	movl	%r13d, 116(%rdx,%rsi)
+	cmpl	$0, 112(%r12)
+	je	.LBB24_80
+# %bb.79:                               #   in Loop: Header=BB24_77 Depth=2
+	xorl	%edi, %edi
+	jmp	.LBB24_81
+.LBB24_65:                              #   in Loop: Header=BB24_1 Depth=1
+	xorl	%eax, %eax
+	.p2align	4, 0x90
+.LBB24_83:                              # %if.end271
+                                        #   in Loop: Header=BB24_1 Depth=1
+	movq	(%r9), %rcx
+	movl	(%r8), %edx
+	testq	%rcx, %rcx
+	je	.LBB24_84
+.LBB24_85:                              # %if.else281
+                                        #   in Loop: Header=BB24_1 Depth=1
+	subl	%edx, 96(%rcx)
+	movq	_ZN6Halide7Runtime8Internal10work_queueE@GOTPCREL(%rip), %r13
+	jmp	.LBB24_86
+.LBB24_75:                              #   in Loop: Header=BB24_1 Depth=1
+	xorl	%eax, %eax
+	movq	(%r9), %rcx
+	movl	(%r8), %edx
+	testq	%rcx, %rcx
+	jne	.LBB24_85
+.LBB24_84:                              # %if.then274
+                                        #   in Loop: Header=BB24_1 Depth=1
+	movq	_ZN6Halide7Runtime8Internal10work_queueE@GOTPCREL(%rip), %r13
+	subl	%edx, 2124(%r13)
+.LBB24_86:                              # %if.end290
+                                        #   in Loop: Header=BB24_1 Depth=1
+	movl	112(%r12), %ecx
+	decl	%ecx
+	movl	%ecx, 112(%r12)
+	testb	$1, %al
+	jne	.LBB24_91
+# %bb.87:                               # %lor.lhs.false297
+                                        #   in Loop: Header=BB24_1 Depth=1
+	movl	$0, %r14d
+	testl	%ecx, %ecx
+	jne	.LBB24_1
+# %bb.88:                               # %land.lhs.true300
+                                        #   in Loop: Header=BB24_1 Depth=1
+	cmpl	$0, 40(%r12)
+	je	.LBB24_90
+# %bb.89:                               # %lor.lhs.false304
+                                        #   in Loop: Header=BB24_1 Depth=1
+	cmpl	$0, 116(%r12)
+	movl	$0, %r14d
+	jne	.LBB24_90
+	jmp	.LBB24_1
+.LBB24_92:                              # %while.end316
+	addq	$88, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end24:
+	.size	_ZN6Halide7Runtime8Internal28worker_thread_already_lockedEPNS1_4workE, .Lfunc_end24-_ZN6Halide7Runtime8Internal28worker_thread_already_lockedEPNS1_4workE
+                                        # -- End function
+	.section	.text.halide_mutex_unlock,"ax",@progbits
+	.weak	halide_mutex_unlock             # -- Begin function halide_mutex_unlock
+	.p2align	4, 0x90
+	.type	halide_mutex_unlock,@function
+halide_mutex_unlock:                    # @halide_mutex_unlock -- %rdi = address of the mutex word; tries to swap it from 1 to 0, otherwise calls parking_control::unpark_one
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	subq	$16, %rsp # 16 bytes of stack for the two-word object built in %bb.2
+	xorl	%ecx, %ecx # replacement value = 0
+	movl	$1, %eax # expected value = 1
+	lock		cmpxchgq	%rcx, (%rdi) # atomically: if the word equals 1, store 0
+	je	.LBB25_3 # swap succeeded, nothing more to do
+# %bb.1:                                # %if.then.i
+	movq	%rdi, %rsi # keep the mutex address in %rsi; it is still there at the call below
+	xorl	%ecx, %ecx
+	movl	$1, %eax
+	lock		cmpxchgq	%rcx, (%rdi) # second attempt at the same 1 to 0 swap
+	je	.LBB25_3
+# %bb.2:                                # %if.end.i.i
+	movq	_ZTVN6Halide7Runtime8Internal15Synchronization21mutex_parking_controlE@GOTPCREL(%rip), %rax # address of the mutex_parking_control vtable symbol
+	addq	$16, %rax # NOTE(review): presumably skips the two header words of the vtable (Itanium C++ ABI) - confirm
+	movq	%rax, -16(%rbp) # word 0 of the stack object: vtable pointer
+	movq	%rsi, -8(%rbp) # word 1 of the stack object: the mutex address
+	leaq	-16(%rbp), %rdi # %rdi = address of the stack object
+	callq	_ZN6Halide7Runtime8Internal15Synchronization15parking_control10unpark_oneEy@PLT # unpark_one(stack object, mutex address); the returned value is not used
+.LBB25_3:                               # %_ZN6Halide7Runtime8Internal15Synchronization10fast_mutex6unlockEv.exit
+	addq	$16, %rsp
+	popq	%rbp
+	retq
+.Lfunc_end25:
+	.size	halide_mutex_unlock, .Lfunc_end25-halide_mutex_unlock
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal15Synchronization15parking_control10unpark_oneEy,"ax",@progbits
+	.weak	_ZN6Halide7Runtime8Internal15Synchronization15parking_control10unpark_oneEy # -- Begin function _ZN6Halide7Runtime8Internal15Synchronization15parking_control10unpark_oneEy
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal15Synchronization15parking_control10unpark_oneEy,@function
+_ZN6Halide7Runtime8Internal15Synchronization15parking_control10unpark_oneEy: # @_ZN6Halide7Runtime8Internal15Synchronization15parking_control10unpark_oneEy -- %rdi = control object, %rsi = key; unlinks and signals the first list node of the key's bucket whose word at offset 136 equals the key; returns 1 if a later node has the same key, else 0
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	subq	$24, %rsp
+	movq	%rsi, %r15 # %r15 = key being searched for
+	movq	%rdi, -56(%rbp)                 # 8-byte Spill -- control object pointer
+	movq	%rsi, %rdi
+	callq	_ZN6Halide7Runtime8Internal15Synchronization11lock_bucketEy@PLT # lock_bucket(key) returns the bucket address
+	movq	%rax, %rbx # %rbx = bucket
+	movq	8(%rax), %r12 # %r12 = current node, starting from the pointer stored at bucket+8
+	movq	%rax, %r13
+	addq	$8, %r13 # %r13 = address of the link that points at the current node (initially bucket+8)
+	xorl	%eax, %eax
+	movq	%rax, -64(%rbp)                 # 8-byte Spill -- previous node = null
+                                        # implicit-def: $rax
+                                        # kill: killed $rax
+	jmp	.LBB26_2
+	.p2align	4, 0x90
+.LBB26_1:                               #   in Loop: Header=BB26_2 Depth=1
+	leaq	144(%r12), %r13 # no match: the link to patch becomes this node's pointer at offset 144
+	movq	%r12, -64(%rbp)                 # 8-byte Spill -- previous node = current node
+	movq	%rax, %r12 # advance to the next node (loaded from offset 144 below)
+	cmpq	%r15, %r14 # same comparison that led here, so the branch below is not taken on this path
+	je	.LBB26_22
+.LBB26_2:                               # %while.cond
+                                        # =>This Loop Header: Depth=1
+                                        #     Child Loop BB26_6 Depth 2
+                                        #     Child Loop BB26_12 Depth 2
+	testq	%r12, %r12
+	je	.LBB26_17 # end of list reached without a match
+# %bb.3:                                # %while.body
+                                        #   in Loop: Header=BB26_2 Depth=1
+	movq	136(%r12), %r14 # %r14 = key stored in the node
+	movq	144(%r12), %rax # %rax = next node
+	cmpq	%r15, %r14
+	jne	.LBB26_1
+# %bb.4:                                # %if.then
+                                        #   in Loop: Header=BB26_2 Depth=1
+	movq	%rax, (%r13) # unlink: the previous link now points at the next node
+	cmpq	%r12, 16(%rbx) # is the pointer at bucket+16 the node being removed?
+	je	.LBB26_9
+# %bb.5:                                # %while.cond7.preheader
+                                        #   in Loop: Header=BB26_2 Depth=1
+	testq	%rax, %rax
+	je	.LBB26_10 # nothing after this node: flag = 0
+	.p2align	4, 0x90
+.LBB26_6:                               # %while.body9
+                                        #   Parent Loop BB26_2 Depth=1
+                                        # =>  This Inner Loop Header: Depth=2
+	movq	136(%rax), %rcx # key of a later node
+	movq	144(%rax), %rax # step to its successor
+	testq	%rax, %rax
+	je	.LBB26_8 # stop at the last node
+# %bb.7:                                # %while.body9
+                                        #   in Loop: Header=BB26_6 Depth=2
+	cmpq	%r15, %rcx
+	jne	.LBB26_6 # keep scanning until a node with the same key is seen
+.LBB26_8:                               # %if.end.loopexit
+                                        #   in Loop: Header=BB26_2 Depth=1
+	cmpq	%r15, %rcx
+	sete	%al # %al = 1 if the last key examined equals the searched key
+	jmp	.LBB26_11
+.LBB26_9:                               # %if.then5
+                                        #   in Loop: Header=BB26_2 Depth=1
+	movq	-64(%rbp), %rax                 # 8-byte Reload
+	movq	%rax, 16(%rbx) # the pointer at bucket+16 becomes the previous node
+.LBB26_10:                              #   in Loop: Header=BB26_2 Depth=1
+	xorl	%eax, %eax # flag = 0
+.LBB26_11:                              # %if.end
+                                        #   in Loop: Header=BB26_2 Depth=1
+	movq	-56(%rbp), %rdi                 # 8-byte Reload
+	movq	(%rdi), %rcx # vtable pointer of the control object
+	movzbl	%al, %edx # third argument: the flag computed above
+	movl	$1, %esi # second argument: 1
+	movq	%rdx, -48(%rbp)                 # 8-byte Spill -- this flag is also the return value
+                                        # kill: def $edx killed $edx killed $rdx
+	callq	*16(%rcx) # indirect call through the vtable entry at offset 16
+	movq	%rax, 152(%r12) # store the call's result in the removed node at offset 152
+	movq	%r12, %rdi
+	callq	pthread_mutex_lock@PLT # the node address itself is passed as the pthread mutex
+	movq	(%rbx), %rax # current value of the bucket's first word
+	.p2align	4, 0x90
+.LBB26_12:                              # %atomicrmw.start
+                                        #   Parent Loop BB26_2 Depth=1
+                                        # =>  This Inner Loop Header: Depth=2
+	movq	%rax, %rcx
+	andq	$-2, %rcx # same value with bit 0 cleared
+	lock		cmpxchgq	%rcx, (%rbx) # retry until bit 0 of the bucket word is cleared atomically
+	jne	.LBB26_12
+# %bb.13:                               # %atomicrmw.end
+                                        #   in Loop: Header=BB26_2 Depth=1
+	cmpq	$4, %rax # old value below 4: no bits above bit 1 were set
+	jb	.LBB26_16
+# %bb.14:                               # %atomicrmw.end
+                                        #   in Loop: Header=BB26_2 Depth=1
+	andl	$2, %eax # bit 1 of the old value set: skip unlock_full
+	jne	.LBB26_16
+# %bb.15:                               # %if.then.i
+                                        #   in Loop: Header=BB26_2 Depth=1
+	movq	%rbx, %rdi
+	callq	_ZN6Halide7Runtime8Internal15Synchronization9word_lock11unlock_fullEv@PLT
+.LBB26_16:                              # %_ZN6Halide7Runtime8Internal15Synchronization9word_lock6unlockEv.exit
+                                        #   in Loop: Header=BB26_2 Depth=1
+	movb	$0, 128(%r12) # clear the byte at node offset 128
+	leaq	64(%r12), %rdi # condition variable lives at node offset 64
+	callq	pthread_cond_signal@PLT
+	movq	%r12, %rdi
+	callq	pthread_mutex_unlock@PLT
+	cmpq	%r15, %r14 # both registers are callee-saved and were equal at the match, so fall through to the return
+	jne	.LBB26_2
+	jmp	.LBB26_22
+.LBB26_17:                              # %while.end22
+	movq	-56(%rbp), %rdi                 # 8-byte Reload
+	movq	(%rdi), %rax
+	xorl	%esi, %esi # second argument: 0
+	xorl	%edx, %edx # third argument: 0
+	callq	*16(%rax) # same vtable entry as above; its result is discarded here
+	movq	(%rbx), %rax
+	.p2align	4, 0x90
+.LBB26_18:                              # %atomicrmw.start2
+                                        # =>This Inner Loop Header: Depth=1
+	movq	%rax, %rcx
+	andq	$-2, %rcx
+	lock		cmpxchgq	%rcx, (%rbx) # atomically clear bit 0 of the bucket word
+	jne	.LBB26_18
+# %bb.19:                               # %atomicrmw.end1
+	xorl	%ecx, %ecx
+	movq	%rcx, -48(%rbp)                 # 8-byte Spill -- return value 0
+	cmpq	$4, %rax
+	jb	.LBB26_22
+# %bb.20:                               # %atomicrmw.end1
+	andl	$2, %eax
+	jne	.LBB26_22
+# %bb.21:                               # %if.then.i60
+	movq	%rbx, %rdi
+	callq	_ZN6Halide7Runtime8Internal15Synchronization9word_lock11unlock_fullEv@PLT
+	xorl	%eax, %eax
+	movq	%rax, -48(%rbp)                 # 8-byte Spill
+.LBB26_22:                              # %cleanup27
+	movq	-48(%rbp), %rax                 # 8-byte Reload -- return value
+	addq	$24, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end26:
+	.size	_ZN6Halide7Runtime8Internal15Synchronization15parking_control10unpark_oneEy, .Lfunc_end26-_ZN6Halide7Runtime8Internal15Synchronization15parking_control10unpark_oneEy
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal15Synchronization11lock_bucketEy,"ax",@progbits
+	.weak	_ZN6Halide7Runtime8Internal15Synchronization11lock_bucketEy # -- Begin function _ZN6Halide7Runtime8Internal15Synchronization11lock_bucketEy
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal15Synchronization11lock_bucketEy,@function
+_ZN6Halide7Runtime8Internal15Synchronization11lock_bucketEy: # @_ZN6Halide7Runtime8Internal15Synchronization11lock_bucketEy -- %rdi = key; hashes it to one of 1024 24-byte entries of Synchronization::table, locks that entry's first word and returns the entry address
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%rbx
+	pushq	%rax # pushed value is never read; NOTE(review): presumably keeps the stack 16-byte aligned for the call - confirm
+	movabsq	$-7046029254386353131, %rax     # imm = 0x9E3779B97F4A7C15
+	imulq	%rdi, %rax # multiplicative hash of the key
+	shrq	$54, %rax # keep the top 10 bits of the product: index in 0..1023
+	leaq	(%rax,%rax,2), %rcx # index * 3
+	movq	_ZN6Halide7Runtime8Internal15Synchronization5tableE@GOTPCREL(%rip), %rdx
+	leaq	(%rdx,%rcx,8), %rbx # %rbx = table + index * 24
+	movl	$1, %esi # replacement value = 1
+	xorl	%eax, %eax # expected value = 0
+	lock		cmpxchgq	%rsi, (%rdx,%rcx,8) # atomically: if the entry's first word is 0, store 1
+	je	.LBB27_2 # swap succeeded, skip the lock_full call
+# %bb.1:                                # %if.then.i
+	movq	%rbx, %rdi
+	callq	_ZN6Halide7Runtime8Internal15Synchronization9word_lock9lock_fullEv@PLT # otherwise call word_lock::lock_full on the entry
+.LBB27_2:                               # %_ZN6Halide7Runtime8Internal15Synchronization9word_lock4lockEv.exit
+	movq	%rbx, %rax # return the entry address
+	addq	$8, %rsp # drop the slot pushed by pushq %rax
+	popq	%rbx
+	popq	%rbp
+	retq
+.Lfunc_end27:
+	.size	_ZN6Halide7Runtime8Internal15Synchronization11lock_bucketEy, .Lfunc_end27-_ZN6Halide7Runtime8Internal15Synchronization11lock_bucketEy
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal15Synchronization9word_lock11unlock_fullEv,"ax",@progbits
+	.weak	_ZN6Halide7Runtime8Internal15Synchronization9word_lock11unlock_fullEv # -- Begin function _ZN6Halide7Runtime8Internal15Synchronization9word_lock11unlock_fullEv
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal15Synchronization9word_lock11unlock_fullEv,@function
+_ZN6Halide7Runtime8Internal15Synchronization9word_lock11unlock_fullEv: # @_ZN6Halide7Runtime8Internal15Synchronization9word_lock11unlock_fullEv -- %rdi = address of the lock word; sets bit 1 of the word, removes one node from the list the word points to and signals that node's condition variable
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	pushq	%rax # creates the 8-byte slot at -48(%rbp) used for the spills below
+	movq	%rdi, %rbx # %rbx = address of the lock word
+	movq	(%rdi), %r14 # %r14 = last observed value of the lock word
+	.p2align	4, 0x90
+.LBB28_1:                               # %while.cond
+                                        # =>This Inner Loop Header: Depth=1
+	cmpq	$4, %r14 # value below 4: no bits above bit 1, nothing to do
+	jb	.LBB28_18
+# %bb.2:                                # %while.cond
+                                        #   in Loop: Header=BB28_1 Depth=1
+	movl	%r14d, %eax
+	andl	$2, %eax # bit 1 already set: return
+	jne	.LBB28_18
+# %bb.3:                                # %if.end
+                                        #   in Loop: Header=BB28_1 Depth=1
+	movq	%r14, %rcx
+	orq	$2, %rcx
+	movq	%r14, %rax
+	lock		cmpxchgq	%rcx, (%rbx) # try to set bit 1 atomically
+	movq	%rax, %r14 # remember the value the compare-exchange observed
+	jne	.LBB28_1 # word changed underneath us: re-evaluate
+	jmp	.LBB28_4
+	.p2align	4, 0x90
+.LBB28_11:                              #   in Loop: Header=BB28_4 Depth=1
+	movq	%rax, %r14 # compare-exchange in .LBB28_10 failed: continue with the observed value
+	#MEMBARRIER
+	jmp	.LBB28_4
+	.p2align	4, 0x90
+.LBB28_5:                               # %while.body17.preheader
+                                        #   in Loop: Header=BB28_4 Depth=1
+	movq	-48(%rbp), %r15                 # 8-byte Reload -- start the walk at the head node
+	jmp	.LBB28_6
+	.p2align	4, 0x90
+.LBB28_8:                               # %do.end
+                                        #   in Loop: Header=BB28_6 Depth=2
+	movq	%r15, 144(%r13) # store a back link (offset 144) from the node reached to the node we came from
+	movq	152(%r13), %r12 # pointer cached at offset 152 of the node reached
+	movq	%r13, %r15 # continue the walk from that node
+	testq	%r12, %r12
+	jne	.LBB28_9 # a non-null pointer at offset 152 ends the walk
+.LBB28_6:                               # %while.body17
+                                        #   Parent Loop BB28_4 Depth=1
+                                        # =>  This Inner Loop Header: Depth=2
+	movq	136(%r15), %r13 # follow the link at offset 136
+	testq	%r13, %r13
+	jne	.LBB28_8
+# %bb.7:                                # %if.then20
+                                        #   in Loop: Header=BB28_6 Depth=2
+	xorl	%edi, %edi
+	leaq	.L.str.5(%rip), %rsi
+	callq	halide_print@PLT # a null link here is reported with the message at .L.str.5
+	callq	abort@PLT
+	jmp	.LBB28_8
+	.p2align	4, 0x90
+.LBB28_4:                               # %while.cond11
+                                        # =>This Loop Header: Depth=1
+                                        #     Child Loop BB28_6 Depth 2
+                                        #     Child Loop BB28_13 Depth 2
+	movq	%r14, %rax
+	andq	$-4, %rax # strip the two low bits: what remains is the head node pointer
+	movq	%rax, -48(%rbp)                 # 8-byte Spill -- head node
+	movq	152(%rax), %r12 # pointer cached at offset 152 of the head node
+	testq	%r12, %r12
+	je	.LBB28_5 # not cached yet: walk the list to find it
+.LBB28_9:                               # %while.end23
+                                        #   in Loop: Header=BB28_4 Depth=1
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	%r12, 152(%rax) # cache the pointer in the head node
+	testb	$1, %r14b # bit 0 of the observed lock word
+	jne	.LBB28_10
+# %bb.12:                               # %if.end35
+                                        #   in Loop: Header=BB28_4 Depth=1
+	movq	144(%r12), %rax # back link of the node that will be signalled
+	testq	%rax, %rax
+	jne	.LBB28_16 # another node remains behind it
+	.p2align	4, 0x90
+.LBB28_13:                              # %while.body41
+                                        #   Parent Loop BB28_4 Depth=1
+                                        # =>  This Inner Loop Header: Depth=2
+	movl	%r14d, %ecx
+	andl	$1, %ecx # new word keeps only bit 0: pointer bits and bit 1 are cleared
+	movq	%r14, %rax
+	lock		cmpxchgq	%rcx, (%rbx)
+	je	.LBB28_17 # list emptied: go signal the node
+# %bb.14:                               # %if.end47
+                                        #   in Loop: Header=BB28_13 Depth=2
+	movq	%rax, %r14
+	cmpq	$4, %rax
+	jb	.LBB28_13 # observed word has no pointer bits: retry the swap
+# %bb.15:                               # %cleanup70
+                                        #   in Loop: Header=BB28_4 Depth=1
+	#MEMBARRIER
+	jmp	.LBB28_4 # pointer bits present in the observed word: restart from the new head
+	.p2align	4, 0x90
+.LBB28_10:                              # %if.then27
+                                        #   in Loop: Header=BB28_4 Depth=1
+	movq	%r14, %rcx
+	andq	$-3, %rcx # same word with bit 1 cleared
+	movq	%r14, %rax
+	lock		cmpxchgq	%rcx, (%rbx) # bit 0 was set: just clear bit 1 and return without signalling
+	jne	.LBB28_11
+.LBB28_18:                              # %cleanup75
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.LBB28_16:                              # %if.else62
+	movq	-48(%rbp), %rcx                 # 8-byte Reload
+	movq	%rax, 152(%rcx) # the head's cached pointer becomes the node behind the one removed
+	lock		andq	$-3, (%rbx) # atomically clear bit 1 of the lock word
+.LBB28_17:                              # %if.end66
+	movq	%r12, %rdi
+	callq	pthread_mutex_lock@PLT # the node address itself is passed as the pthread mutex
+	movb	$0, 128(%r12) # clear the byte at node offset 128
+	leaq	64(%r12), %rdi # condition variable lives at node offset 64
+	callq	pthread_cond_signal@PLT
+	movq	%r12, %rdi
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	jmp	pthread_mutex_unlock@PLT        # TAILCALL
+.Lfunc_end28:
+	.size	_ZN6Halide7Runtime8Internal15Synchronization9word_lock11unlock_fullEv, .Lfunc_end28-_ZN6Halide7Runtime8Internal15Synchronization9word_lock11unlock_fullEv
+                                        # -- End function
+	.section	.rodata.cst16,"aM",@progbits,16
+	.p2align	4, 0x0                          # -- Begin function _ZN6Halide7Runtime8Internal15Synchronization9word_lock9lock_fullEv
+.LCPI29_0:
+	.zero	16 # 16-byte all-zero constant; not referenced by the instructions of this function
+	.section	.text._ZN6Halide7Runtime8Internal15Synchronization9word_lock9lock_fullEv,"ax",@progbits
+	.weak	_ZN6Halide7Runtime8Internal15Synchronization9word_lock9lock_fullEv
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal15Synchronization9word_lock9lock_fullEv,@function
+_ZN6Halide7Runtime8Internal15Synchronization9word_lock9lock_fullEv: # @_ZN6Halide7Runtime8Internal15Synchronization9word_lock9lock_fullEv -- %rdi = address of the lock word; loops until it manages to set bit 0, yielding up to a bounded number of times and otherwise linking a stack-allocated node into the word and waiting on the node's condition variable
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	subq	$168, %rsp
+	movq	%rdi, %rbx # %rbx = address of the lock word
+	movq	(%rdi), %r12 # %r12 = last observed value of the lock word
+	movl	$40, %r13d # remaining yield budget
+	leaq	-136(%rbp), %r14 # %r14 = condition variable inside the stack node (node offset 64)
+	leaq	-200(%rbp), %r15 # %r15 = base of the stack node, which starts with the pthread mutex
+	jmp	.LBB29_1
+.LBB29_16:                              # %_ZN6Halide7Runtime8Internal15Synchronization13thread_parker4parkEv.exit
+                                        #   in Loop: Header=BB29_1 Depth=1
+	movq	%r15, %rdi
+	callq	pthread_mutex_unlock@PLT
+	movq	(%rbx), %r12 # reload the lock word after being woken
+	movl	$40, %r13d # reset the yield budget
+.LBB29_17:                              # %if.end22
+                                        #   in Loop: Header=BB29_1 Depth=1
+	movq	%r14, %rdi
+	callq	pthread_cond_destroy@PLT # tear down the stack node's condition variable and mutex before retrying
+	movq	%r15, %rdi
+	callq	pthread_mutex_destroy@PLT
+	.p2align	4, 0x90
+.LBB29_1:                               # %while.cond.outer
+                                        # =>This Loop Header: Depth=1
+                                        #     Child Loop BB29_14 Depth 2
+	testb	$1, %r12b # bit 0 set means the swap below cannot be attempted
+	jne	.LBB29_4
+# %bb.2:                                # %if.then
+                                        #   in Loop: Header=BB29_1 Depth=1
+	movq	%r12, %rcx
+	orq	$1, %rcx
+	movq	%r12, %rax
+	lock		cmpxchgq	%rcx, (%rbx) # try to set bit 0 atomically
+	je	.LBB29_18 # success: return
+# %bb.3:                                # %_ZN6Halide7Runtime8Internal15Synchronization12_GLOBAL__N_131atomic_cas_weak_acquire_relaxedEPyS4_S4_.exit
+                                        #   in Loop: Header=BB29_1 Depth=1
+	movq	%rax, %r12 # continue with the value the compare-exchange observed
+	jmp	.LBB29_1
+.LBB29_4:                               # %if.end4
+                                        #   in Loop: Header=BB29_1 Depth=1
+	cmpq	$4, %r12 # word below 4: no pointer bits, go straight to the enqueue path
+	jb	.LBB29_8
+# %bb.5:                                # %if.end4
+                                        #   in Loop: Header=BB29_1 Depth=1
+	testl	%r13d, %r13d
+	jle	.LBB29_8 # yield budget exhausted
+# %bb.6:                                # %_ZN6Halide7Runtime8Internal15Synchronization12spin_control11should_spinEv.exit
+                                        #   in Loop: Header=BB29_1 Depth=1
+	decl	%r13d
+	je	.LBB29_7 # budget just reached zero: enqueue instead of yielding
+# %bb.19:                               # %if.then7
+                                        #   in Loop: Header=BB29_1 Depth=1
+	callq	halide_thread_yield@PLT
+	movq	(%rbx), %r12 # reload the lock word and try again
+	jmp	.LBB29_1
+.LBB29_7:                               #   in Loop: Header=BB29_1 Depth=1
+	xorl	%r13d, %r13d
+.LBB29_8:                               # %if.end9
+                                        #   in Loop: Header=BB29_1 Depth=1
+	movb	$0, -72(%rbp) # flag byte at node offset 128
+	movq	%r15, %rdi
+	xorl	%esi, %esi
+	callq	pthread_mutex_init@PLT # default attributes (null)
+	movq	%r14, %rdi
+	xorl	%esi, %esi
+	callq	pthread_cond_init@PLT # default attributes (null)
+	vxorps	%xmm0, %xmm0, %xmm0
+	vmovups	%xmm0, -64(%rbp) # zero the two pointers at node offsets 136 and 144
+	movq	$0, -48(%rbp) # zero the pointer at node offset 152
+	movb	$1, -72(%rbp) # set the flag byte; the wait loop below runs while it is non-zero
+	movq	%r12, %rax
+	andq	$-4, %rax # pointer bits of the observed lock word (current head node, if any)
+	je	.LBB29_9
+# %bb.10:                               # %if.else
+                                        #   in Loop: Header=BB29_1 Depth=1
+	movq	%rax, -64(%rbp) # link at offset 136 points to the previous head
+	jmp	.LBB29_11
+.LBB29_9:                               # %if.then12
+                                        #   in Loop: Header=BB29_1 Depth=1
+	movq	%r15, -48(%rbp) # no previous head: the pointer at offset 152 refers to this node itself
+.LBB29_11:                              # %if.end13
+                                        #   in Loop: Header=BB29_1 Depth=1
+	movl	%r12d, %ecx
+	andl	$3, %ecx # keep the two low bits of the observed word
+	orq	%r15, %rcx # combine them with this node's address as the new head
+	movq	%r12, %rax
+	lock		cmpxchgq	%rcx, (%rbx) # publish the node as the new head
+	jne	.LBB29_12
+# %bb.13:                               # %if.then19
+                                        #   in Loop: Header=BB29_1 Depth=1
+	movq	%r15, %rdi
+	callq	pthread_mutex_lock@PLT
+	cmpb	$0, -72(%rbp)
+	je	.LBB29_16 # flag already cleared: no need to wait
+	.p2align	4, 0x90
+.LBB29_14:                              # %while.body.i
+                                        #   Parent Loop BB29_1 Depth=1
+                                        # =>  This Inner Loop Header: Depth=2
+	movq	%r14, %rdi
+	movq	%r15, %rsi
+	callq	pthread_cond_wait@PLT
+	cmpb	$0, -72(%rbp)
+	jne	.LBB29_14 # keep waiting until the flag byte is cleared
+	jmp	.LBB29_16
+.LBB29_12:                              # %_ZN6Halide7Runtime8Internal15Synchronization12_GLOBAL__N_131atomic_cas_weak_release_relaxedEPyS4_S4_.exit
+                                        #   in Loop: Header=BB29_1 Depth=1
+	movq	%rax, %r12 # publish failed: continue with the observed value after destroying the node
+	jmp	.LBB29_17
+.LBB29_18:                              # %cleanup23
+	addq	$168, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end29:
+	.size	_ZN6Halide7Runtime8Internal15Synchronization9word_lock9lock_fullEv, .Lfunc_end29-_ZN6Halide7Runtime8Internal15Synchronization9word_lock9lock_fullEv
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal15Synchronization21mutex_parking_control8validateERNS2_15validate_actionE,"axG",@progbits,_ZN6Halide7Runtime8Internal15Synchronization21mutex_parking_control8validateERNS2_15validate_actionE,comdat
+	.weak	_ZN6Halide7Runtime8Internal15Synchronization21mutex_parking_control8validateERNS2_15validate_actionE # -- Begin function _ZN6Halide7Runtime8Internal15Synchronization21mutex_parking_control8validateERNS2_15validate_actionE
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal15Synchronization21mutex_parking_control8validateERNS2_15validate_actionE,@function
+_ZN6Halide7Runtime8Internal15Synchronization21mutex_parking_control8validateERNS2_15validate_actionE: # @_ZN6Halide7Runtime8Internal15Synchronization21mutex_parking_control8validateERNS2_15validate_actionE -- %rdi = control object; returns 1 in %al when the word pointed to by the object's field at offset 8 equals 3, else 0; %rsi is not read
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	8(%rdi), %rax # pointer stored at offset 8 of the object
+	movq	(%rax), %rax # load the word it points to
+	cmpq	$3, %rax
+	sete	%al # result: word == 3
+	popq	%rbp
+	retq
+.Lfunc_end30:
+	.size	_ZN6Halide7Runtime8Internal15Synchronization21mutex_parking_control8validateERNS2_15validate_actionE, .Lfunc_end30-_ZN6Halide7Runtime8Internal15Synchronization21mutex_parking_control8validateERNS2_15validate_actionE
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal15Synchronization15parking_control12before_sleepEv,"axG",@progbits,_ZN6Halide7Runtime8Internal15Synchronization15parking_control12before_sleepEv,comdat
+	.weak	_ZN6Halide7Runtime8Internal15Synchronization15parking_control12before_sleepEv # -- Begin function _ZN6Halide7Runtime8Internal15Synchronization15parking_control12before_sleepEv
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal15Synchronization15parking_control12before_sleepEv,@function
+_ZN6Halide7Runtime8Internal15Synchronization15parking_control12before_sleepEv: # @_ZN6Halide7Runtime8Internal15Synchronization15parking_control12before_sleepEv -- empty body: sets up and tears down the frame, then returns without touching any argument
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	popq	%rbp
+	retq
+.Lfunc_end31:
+	.size	_ZN6Halide7Runtime8Internal15Synchronization15parking_control12before_sleepEv, .Lfunc_end31-_ZN6Halide7Runtime8Internal15Synchronization15parking_control12before_sleepEv
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal15Synchronization21mutex_parking_control6unparkEib,"axG",@progbits,_ZN6Halide7Runtime8Internal15Synchronization21mutex_parking_control6unparkEib,comdat
+	.weak	_ZN6Halide7Runtime8Internal15Synchronization21mutex_parking_control6unparkEib # -- Begin function _ZN6Halide7Runtime8Internal15Synchronization21mutex_parking_control6unparkEib
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal15Synchronization21mutex_parking_control6unparkEib,@function
+_ZN6Halide7Runtime8Internal15Synchronization21mutex_parking_control6unparkEib: # @_ZN6Halide7Runtime8Internal15Synchronization21mutex_parking_control6unparkEib
+# %bb.0:                                # %entry -- stores twice the value of the third register argument into the word reached through offset 8 of %rdi, then returns 0; the int argument in %esi is unused
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movl	%edx, %eax	# third register argument (the bool in the mangled signature "ib"), zero-extended
+	addq	%rax, %rax	# doubled: 0 stays 0, 1 becomes 2
+	movq	8(%rdi), %rcx	# pointer field at offset 8 of the object
+	movq	%rax, (%rcx)	# overwrite the whole 64-bit word with that value
+	xorl	%eax, %eax	# return value is always 0
+	popq	%rbp
+	retq
+.Lfunc_end32:
+	.size	_ZN6Halide7Runtime8Internal15Synchronization21mutex_parking_control6unparkEib, .Lfunc_end32-_ZN6Halide7Runtime8Internal15Synchronization21mutex_parking_control6unparkEib
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal15Synchronization15parking_control16requeue_callbackERKNS2_15validate_actionEbb,"axG",@progbits,_ZN6Halide7Runtime8Internal15Synchronization15parking_control16requeue_callbackERKNS2_15validate_actionEbb,comdat
+	.weak	_ZN6Halide7Runtime8Internal15Synchronization15parking_control16requeue_callbackERKNS2_15validate_actionEbb # -- Begin function _ZN6Halide7Runtime8Internal15Synchronization15parking_control16requeue_callbackERKNS2_15validate_actionEbb
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal15Synchronization15parking_control16requeue_callbackERKNS2_15validate_actionEbb,@function
+_ZN6Halide7Runtime8Internal15Synchronization15parking_control16requeue_callbackERKNS2_15validate_actionEbb: # @_ZN6Halide7Runtime8Internal15Synchronization15parking_control16requeue_callbackERKNS2_15validate_actionEbb
+# %bb.0:                                # %entry -- empty body: all arguments are ignored and nothing is read or written
+	pushq	%rbp
+	movq	%rsp, %rbp
+	popq	%rbp
+	retq
+.Lfunc_end33:
+	.size	_ZN6Halide7Runtime8Internal15Synchronization15parking_control16requeue_callbackERKNS2_15validate_actionEbb, .Lfunc_end33-_ZN6Halide7Runtime8Internal15Synchronization15parking_control16requeue_callbackERKNS2_15validate_actionEbb
+                                        # -- End function
+	.section	.text.halide_cond_broadcast,"ax",@progbits
+	.weak	halide_cond_broadcast           # -- Begin function halide_cond_broadcast
+	.p2align	4, 0x90
+	.type	halide_cond_broadcast,@function
+halide_cond_broadcast:                  # @halide_cond_broadcast
+# %bb.0:                                # %entry -- if the word at the address in %rdi is non-zero, builds a broadcast_parking_control on the stack and calls parking_control::unpark_requeue with it; otherwise does nothing
+	pushq	%rbp
+	movq	%rsp, %rbp
+	subq	$32, %rsp	# room for the 24-byte local object at -24(%rbp)
+	movq	(%rdi), %rdx	# word currently stored at the argument address; also the third call argument below
+	testq	%rdx, %rdx
+	je	.LBB34_2	# zero word: skip the call entirely
+# %bb.1:                                # %if.end.i
+	movq	%rdi, %rsi	# second call argument = the original argument pointer
+	movq	_ZTVN6Halide7Runtime8Internal15Synchronization25broadcast_parking_controlE@GOTPCREL(%rip), %rax
+	addq	$16, %rax	# vtable symbol plus 16 is the value stored as the object's vtable pointer
+	movq	%rax, -24(%rbp)	# local object, offset 0: vtable pointer
+	movq	%rdi, -16(%rbp)	# local object, offset 8: the argument pointer
+	movq	%rdx, -8(%rbp)	# local object, offset 16: the word loaded above
+	leaq	-24(%rbp), %rdi	# this = the local object
+	xorl	%ecx, %ecx	# fourth call argument = 0
+	callq	_ZN6Halide7Runtime8Internal15Synchronization15parking_control14unpark_requeueEyyy@PLT
+.LBB34_2:                               # %_ZN6Halide7Runtime8Internal15Synchronization9fast_cond9broadcastEv.exit
+	addq	$32, %rsp
+	popq	%rbp
+	retq
+.Lfunc_end34:
+	.size	halide_cond_broadcast, .Lfunc_end34-halide_cond_broadcast
+                                        # -- End function
+	.section	.text.halide_default_semaphore_try_acquire,"ax",@progbits
+	.weak	halide_default_semaphore_try_acquire # -- Begin function halide_default_semaphore_try_acquire
+	.p2align	4, 0x90
+	.type	halide_default_semaphore_try_acquire,@function
+halide_default_semaphore_try_acquire:   # @halide_default_semaphore_try_acquire
+# %bb.0:                                # %entry -- tries to subtract %esi from the 32-bit counter at (%rdi) with a compare-exchange loop; returns 1 in %al on success or when %esi is 0, and 0 when the result would be negative
+	pushq	%rbp
+	movq	%rsp, %rbp
+	testl	%esi, %esi
+	je	.LBB35_1	# requested amount is 0: report success without touching the counter
+# %bb.2:                                # %if.end
+	movl	(%rdi), %eax
+	mfence	# full fence between the two loads of the counter
+	movl	(%rdi), %eax	# expected value for the first compare-exchange
+	movl	%eax, %edx
+	subl	%esi, %edx	# candidate = expected - requested
+	js	.LBB35_3	# candidate negative: fail immediately
+	.p2align	4, 0x90
+.LBB35_4:                               # %land.rhs
+                                        # =>This Inner Loop Header: Depth=1
+	lock		cmpxchgl	%edx, (%rdi)	# store candidate if the counter still equals %eax; otherwise %eax receives the current value
+	sete	%cl	# %cl records whether the exchange succeeded
+	je	.LBB35_6
+# %bb.5:                                # %land.rhs
+                                        #   in Loop: Header=BB35_4 Depth=1
+	movl	%eax, %edx
+	subl	%esi, %edx	# recompute the candidate from the freshly observed value
+	jns	.LBB35_4	# retry only while the candidate stays non-negative
+.LBB35_6:                               # %return
+	movl	%ecx, %eax	# result is %cl: 1 after a successful exchange, 0 when the loop gave up
+	popq	%rbp
+	retq
+.LBB35_1:
+	movb	$1, %cl
+	movl	%ecx, %eax
+	popq	%rbp
+	retq
+.LBB35_3:
+	xorl	%ecx, %ecx
+	movl	%ecx, %eax
+	popq	%rbp
+	retq
+.Lfunc_end35:
+	.size	halide_default_semaphore_try_acquire, .Lfunc_end35-halide_default_semaphore_try_acquire
+                                        # -- End function
+	.section	.text.halide_cond_wait,"ax",@progbits
+	.weak	halide_cond_wait                # -- Begin function halide_cond_wait
+	.p2align	4, 0x90
+	.type	halide_cond_wait,@function
+halide_cond_wait:                       # @halide_cond_wait
+# %bb.0:                                # %entry -- parks the caller on the address in %rdi via a stack-built wait_parking_control, then makes sure bit 0 of the word at the address in %rsi is set again before returning (spin, yield, then park)
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r12
+	pushq	%rbx
+	subq	$48, %rsp
+	movq	%rsi, %rbx	# second argument kept in a callee-saved register; the exit label below names it a fast_mutex
+	movq	%rdi, %rsi	# first argument becomes the address argument of park()
+	movq	_ZTVN6Halide7Runtime8Internal15Synchronization20wait_parking_controlE@GOTPCREL(%rip), %rax
+	addq	$16, %rax
+	movq	%rax, -72(%rbp)	# local wait_parking_control, offset 0: vtable pointer
+	movq	%rdi, -64(%rbp)	# offset 8: first argument
+	movq	%rbx, -56(%rbp)	# offset 16: second argument
+	leaq	-72(%rbp), %rdi
+	callq	_ZN6Halide7Runtime8Internal15Synchronization15parking_control4parkEy@PLT
+	cmpq	%rbx, %rax
+	jne	.LBB36_1	# park() returned something other than the second argument: the word must be re-acquired here
+# %bb.12:                               # %if.else.i
+	movq	(%rbx), %rax
+	testb	$1, %al
+	jne	.LBB36_14	# bit 0 already set: done
+# %bb.13:                               # %if.then2.i
+	leaq	.L.str.5.6(%rip), %rsi	# message text is defined outside this chunk
+	xorl	%edi, %edi	# first argument of halide_print is null
+	callq	halide_print@PLT
+	callq	abort@PLT
+	jmp	.LBB36_14
+.LBB36_1:                               # %if.then.i
+	movl	$1, %ecx
+	xorl	%eax, %eax
+	lock		cmpxchgq	%rcx, (%rbx)	# fast path: change the word from 0 to 1
+	jne	.LBB36_2	# word was not 0: take the slow path
+.LBB36_14:                              # %_ZN6Halide7Runtime8Internal15Synchronization9fast_cond4waitEPNS2_10fast_mutexE.exit
+	addq	$48, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.LBB36_2:                               # %if.then.i.i
+	movq	(%rbx), %rax	# %rax tracks the last observed value of the word throughout the loop
+	movl	$40, %r12d	# spin budget: 40 iterations before parking
+	movq	_ZTVN6Halide7Runtime8Internal15Synchronization21mutex_parking_controlE@GOTPCREL(%rip), %r15
+	addq	$16, %r15	# %r15 = vtable pointer value for the local mutex_parking_control
+	leaq	-48(%rbp), %r14	# %r14 = address of that local object
+	.p2align	4, 0x90
+.LBB36_3:                               # %while.cond.outer.i.i.i
+                                        # =>This Inner Loop Header: Depth=1
+	testb	$1, %al
+	jne	.LBB36_5	# bit 0 set: cannot take it now
+# %bb.4:                                # %if.then.i.i.i
+                                        #   in Loop: Header=BB36_3 Depth=1
+	movq	%rax, %rcx
+	orq	$1, %rcx
+	lock		cmpxchgq	%rcx, (%rbx)	# try to set bit 0 while keeping the other bits
+	jne	.LBB36_3	# lost the race; %rax now holds the fresh value
+	jmp	.LBB36_14
+.LBB36_5:                               # %if.end4.i.i.i
+                                        #   in Loop: Header=BB36_3 Depth=1
+	testl	%r12d, %r12d
+	jg	.LBB36_6	# spin budget left
+# %bb.8:                                # %if.end8.i.i.i
+                                        #   in Loop: Header=BB36_3 Depth=1
+	testb	$2, %al
+	jne	.LBB36_10	# bit 1 already set: go park
+.LBB36_9:                               # %if.then10.i.i.i
+                                        #   in Loop: Header=BB36_3 Depth=1
+	movq	%rax, %rcx
+	orq	$2, %rcx
+	lock		cmpxchgq	%rcx, (%rbx)	# set bit 1 before parking
+	jne	.LBB36_3
+	jmp	.LBB36_10
+.LBB36_6:                               # %_ZN6Halide7Runtime8Internal15Synchronization12spin_control11should_spinEv.exit.i.i.i
+                                        #   in Loop: Header=BB36_3 Depth=1
+	decl	%r12d
+	je	.LBB36_7	# budget just reached zero: no more yielding
+# %bb.15:                               # %if.then6.i.i.i
+                                        #   in Loop: Header=BB36_3 Depth=1
+	callq	halide_thread_yield@PLT
+	movq	(%rbx), %rax	# reload the word after yielding
+	jmp	.LBB36_3
+.LBB36_7:                               #   in Loop: Header=BB36_3 Depth=1
+	xorl	%r12d, %r12d
+	testb	$2, %al
+	je	.LBB36_9
+.LBB36_10:                              # %if.end19.i.i.i
+                                        #   in Loop: Header=BB36_3 Depth=1
+	movq	%r15, -48(%rbp)	# local mutex_parking_control, offset 0: vtable pointer
+	movq	%rbx, -40(%rbp)	# offset 8: address of the word
+	movq	%r14, %rdi
+	movq	%rbx, %rsi	# park on the address of the word itself
+	callq	_ZN6Halide7Runtime8Internal15Synchronization15parking_control4parkEy@PLT
+	cmpq	%rbx, %rax
+	je	.LBB36_14	# park() returned the word's address: finished
+# %bb.11:                               # %if.end24.i.i.i
+                                        #   in Loop: Header=BB36_3 Depth=1
+	movq	(%rbx), %rax
+	movl	$40, %r12d	# otherwise reset the spin budget and start over
+	jmp	.LBB36_3
+.Lfunc_end36:
+	.size	halide_cond_wait, .Lfunc_end36-halide_cond_wait
+                                        # -- End function
+	.section	.text.halide_do_loop_task,"ax",@progbits
+	.weak	halide_do_loop_task             # -- Begin function halide_do_loop_task
+	.p2align	4, 0x90
+	.type	halide_do_loop_task,@function
+halide_do_loop_task:                    # @halide_do_loop_task
+# %bb.0:                                # %entry -- forwards the call, with all argument registers untouched, to the function pointer stored in the global custom_do_loop_task
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	_ZN6Halide7Runtime8Internal19custom_do_loop_taskE@GOTPCREL(%rip), %rax	# address of the global, taken from the GOT
+	movq	(%rax), %rax	# current handler stored in it
+	popq	%rbp	# frame is torn down first so the handler returns straight to our caller
+	jmpq	*%rax                           # TAILCALL
+.Lfunc_end37:
+	.size	halide_do_loop_task, .Lfunc_end37-halide_do_loop_task
+                                        # -- End function
+	.section	.text.halide_do_task,"ax",@progbits
+	.weak	halide_do_task                  # -- Begin function halide_do_task
+	.p2align	4, 0x90
+	.type	halide_do_task,@function
+halide_do_task:                         # @halide_do_task
+# %bb.0:                                # %entry -- forwards the call, with all argument registers untouched, to the function pointer stored in the global custom_do_task
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	_ZN6Halide7Runtime8Internal14custom_do_taskE@GOTPCREL(%rip), %rax	# address of the global, taken from the GOT
+	movq	(%rax), %rax	# current handler stored in it
+	popq	%rbp	# frame is torn down first so the handler returns straight to our caller
+	jmpq	*%rax                           # TAILCALL
+.Lfunc_end38:
+	.size	halide_do_task, .Lfunc_end38-halide_do_task
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal15Synchronization15parking_control4parkEy,"ax",@progbits
+	.weak	_ZN6Halide7Runtime8Internal15Synchronization15parking_control4parkEy # -- Begin function _ZN6Halide7Runtime8Internal15Synchronization15parking_control4parkEy
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal15Synchronization15parking_control4parkEy,@function
+_ZN6Halide7Runtime8Internal15Synchronization15parking_control4parkEy: # @_ZN6Halide7Runtime8Internal15Synchronization15parking_control4parkEy
+# %bb.0:                                # %entry -- builds a wait record on the stack (pthread mutex at -216, cond at -152, flag byte at -88), locks the bucket for the address in %rsi, calls vtable slot 0; if that returns true the record is appended to the bucket list and the thread sleeps until the flag byte is cleared; returns a 64-bit value read from the record or from the action local
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	subq	$184, %rsp
+	movq	%rsi, %r13	# %r13 = address argument
+	movq	%rdi, %r14	# %r14 = this
+	movb	$0, -88(%rbp)	# flag byte starts cleared
+	leaq	-216(%rbp), %r12	# %r12 = start of the stack record, which begins with the pthread mutex
+	movq	%r12, %rdi
+	xorl	%esi, %esi	# null attributes
+	callq	pthread_mutex_init@PLT
+	leaq	-152(%rbp), %rbx	# %rbx = the pthread condition variable, 64 bytes into the record
+	movq	%rbx, %rdi
+	xorl	%esi, %esi	# null attributes
+	callq	pthread_cond_init@PLT
+	vxorps	%xmm0, %xmm0, %xmm0
+	vmovups	%xmm0, -80(%rbp)	# zero the two quadwords at -80 and -72
+	movq	$0, -64(%rbp)	# and the quadword at -64
+	movq	%r13, %rdi
+	callq	_ZN6Halide7Runtime8Internal15Synchronization11lock_bucketEy@PLT
+	movq	%rax, %r15	# %r15 = bucket returned by lock_bucket
+	movb	$0, -56(%rbp)	# action local: byte at offset 0 cleared
+	movq	$0, -48(%rbp)	# action local: quadword at offset 8 cleared
+	movq	(%r14), %rax
+	leaq	-56(%rbp), %rsi
+	movq	%r14, %rdi
+	callq	*(%rax)	# vtable slot 0 with (this, action) -- presumably validate(); confirm against the class declaration
+	testb	%al, %al
+	je	.LBB39_1	# returned false: do not sleep
+# %bb.6:                                # %if.end
+	movq	$0, -72(%rbp)	# record offset 144 (link to the following record) = null
+	movq	%r13, -80(%rbp)	# record offset 136 = address being waited on
+	movb	$1, -88(%rbp)	# record offset 128 = flag byte set; the wait loop below runs until it reads 0
+	movq	%r15, %rax
+	addq	$8, %rax	# address of the bucket field at offset 8
+	movl	$144, %ecx
+	addq	16(%r15), %rcx	# address of the link field inside the record referenced by bucket offset 16
+	cmpq	$0, 8(%r15)
+	cmoveq	%rax, %rcx	# bucket offset 8 is null: write the new record there instead
+	leaq	-64(%rbp), %r13	# the return value will be read from record offset 152
+	movq	%r12, (%rcx)	# link the record in
+	movq	%r12, 16(%r15)	# bucket offset 16 now references this record
+	movq	(%r15), %rax
+	.p2align	4, 0x90
+.LBB39_7:                               # %atomicrmw.start2
+                                        # =>This Inner Loop Header: Depth=1
+	movq	%rax, %rcx
+	andq	$-2, %rcx
+	lock		cmpxchgq	%rcx, (%r15)	# atomically clear bit 0 of the bucket's first word; %rax ends up holding the previous value
+	jne	.LBB39_7
+# %bb.8:                                # %atomicrmw.end1
+	cmpq	$4, %rax
+	jb	.LBB39_11	# previous value below 4: nothing more to do
+# %bb.9:                                # %atomicrmw.end1
+	andl	$2, %eax
+	jne	.LBB39_11	# bit 1 was set: nothing more to do
+# %bb.10:                               # %if.then.i25
+	movq	%r15, %rdi
+	callq	_ZN6Halide7Runtime8Internal15Synchronization9word_lock11unlock_fullEv@PLT
+.LBB39_11:                              # %_ZN6Halide7Runtime8Internal15Synchronization9word_lock6unlockEv.exit26
+	movq	(%r14), %rax
+	movq	%r14, %rdi
+	callq	*8(%rax)	# vtable slot at offset 8 with (this) -- presumably before_sleep(); confirm against the class declaration
+	leaq	-216(%rbp), %rdi
+	callq	pthread_mutex_lock@PLT
+	cmpb	$0, -88(%rbp)
+	je	.LBB39_14	# flag already cleared: no need to wait
+# %bb.12:                               # %while.body.i.preheader
+	leaq	-216(%rbp), %r14
+	.p2align	4, 0x90
+.LBB39_13:                              # %while.body.i
+                                        # =>This Inner Loop Header: Depth=1
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	callq	pthread_cond_wait@PLT
+	cmpb	$0, -88(%rbp)
+	jne	.LBB39_13	# flag re-checked after every wakeup
+.LBB39_14:                              # %_ZN6Halide7Runtime8Internal15Synchronization13thread_parker4parkEv.exit
+	leaq	-216(%rbp), %rdi
+	callq	pthread_mutex_unlock@PLT
+	jmp	.LBB39_15
+.LBB39_1:                               # %if.then
+	leaq	-48(%rbp), %r13	# the return value will be read from offset 8 of the action local
+	movq	(%r15), %rax
+	.p2align	4, 0x90
+.LBB39_2:                               # %atomicrmw.start
+                                        # =>This Inner Loop Header: Depth=1
+	movq	%rax, %rcx
+	andq	$-2, %rcx
+	lock		cmpxchgq	%rcx, (%r15)	# same bucket release sequence as on the sleeping path
+	jne	.LBB39_2
+# %bb.3:                                # %atomicrmw.end
+	cmpq	$4, %rax
+	jb	.LBB39_15
+# %bb.4:                                # %atomicrmw.end
+	andl	$2, %eax
+	jne	.LBB39_15
+# %bb.5:                                # %if.then.i
+	movq	%r15, %rdi
+	callq	_ZN6Halide7Runtime8Internal15Synchronization9word_lock11unlock_fullEv@PLT
+.LBB39_15:                              # %cleanup
+	movq	(%r13), %r14	# fetch the result before the record is destroyed
+	movq	%rbx, %rdi
+	callq	pthread_cond_destroy@PLT
+	leaq	-216(%rbp), %rdi
+	callq	pthread_mutex_destroy@PLT
+	movq	%r14, %rax
+	addq	$184, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end39:
+	.size	_ZN6Halide7Runtime8Internal15Synchronization15parking_control4parkEy, .Lfunc_end39-_ZN6Halide7Runtime8Internal15Synchronization15parking_control4parkEy
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal15Synchronization20wait_parking_control8validateERNS2_15validate_actionE,"axG",@progbits,_ZN6Halide7Runtime8Internal15Synchronization20wait_parking_control8validateERNS2_15validate_actionE,comdat
+	.weak	_ZN6Halide7Runtime8Internal15Synchronization20wait_parking_control8validateERNS2_15validate_actionE # -- Begin function _ZN6Halide7Runtime8Internal15Synchronization20wait_parking_control8validateERNS2_15validate_actionE
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal15Synchronization20wait_parking_control8validateERNS2_15validate_actionE,@function
+_ZN6Halide7Runtime8Internal15Synchronization20wait_parking_control8validateERNS2_15validate_actionE: # @_ZN6Halide7Runtime8Internal15Synchronization20wait_parking_control8validateERNS2_15validate_actionE
+# %bb.0:                                # %entry -- compares the word behind the pointer at offset 8 of %rdi with the value at offset 16: a zero word is replaced by that value (returns 1), an equal word returns 1, any other word writes the value to offset 8 of the action in %rsi and returns 0
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	8(%rdi), %rax	# pointer field at offset 8
+	movq	(%rax), %rdx	# word currently stored behind it
+	movq	16(%rdi), %rcx	# value field at offset 16
+	testq	%rdx, %rdx
+	je	.LBB40_1	# word is zero
+# %bb.2:                                # %if.else
+	movb	$1, %al
+	cmpq	%rcx, %rdx
+	je	.LBB40_4	# word already equals the value: return 1
+# %bb.3:                                # %if.then5
+	movq	%rcx, 8(%rsi)	# mismatch: hand the value back through offset 8 of the action
+	xorl	%eax, %eax	# and return 0
+.LBB40_4:                               # %cleanup
+                                        # kill: def $al killed $al killed $eax
+	popq	%rbp
+	retq
+.LBB40_1:                               # %if.then
+	movq	%rcx, (%rax)	# install the value into the previously zero word
+	movb	$1, %al
+                                        # kill: def $al killed $al killed $eax
+	popq	%rbp
+	retq
+.Lfunc_end40:
+	.size	_ZN6Halide7Runtime8Internal15Synchronization20wait_parking_control8validateERNS2_15validate_actionE, .Lfunc_end40-_ZN6Halide7Runtime8Internal15Synchronization20wait_parking_control8validateERNS2_15validate_actionE
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal15Synchronization20wait_parking_control12before_sleepEv,"axG",@progbits,_ZN6Halide7Runtime8Internal15Synchronization20wait_parking_control12before_sleepEv,comdat
+	.weak	_ZN6Halide7Runtime8Internal15Synchronization20wait_parking_control12before_sleepEv # -- Begin function _ZN6Halide7Runtime8Internal15Synchronization20wait_parking_control12before_sleepEv
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal15Synchronization20wait_parking_control12before_sleepEv,@function
+_ZN6Halide7Runtime8Internal15Synchronization20wait_parking_control12before_sleepEv: # @_ZN6Halide7Runtime8Internal15Synchronization20wait_parking_control12before_sleepEv
+# %bb.0:                                # %entry -- tries twice to change the word at the address held in offset 16 of %rdi from 1 to 0; if both attempts fail it calls parking_control::unpark_one on that address through a stack-built mutex_parking_control
+	pushq	%rbp
+	movq	%rsp, %rbp
+	subq	$16, %rsp	# room for the 16-byte local object at -16(%rbp)
+	movq	16(%rdi), %rsi	# address of the word; also the second argument of unpark_one below
+	xorl	%ecx, %ecx	# new value 0
+	movl	$1, %eax	# expected value 1
+	lock		cmpxchgq	%rcx, (%rsi)
+	je	.LBB41_3	# word was exactly 1 and is now 0: done
+# %bb.1:                                # %if.then.i
+	xorl	%ecx, %ecx
+	movl	$1, %eax
+	lock		cmpxchgq	%rcx, (%rsi)	# identical second attempt
+	je	.LBB41_3
+# %bb.2:                                # %if.end.i.i
+	movq	_ZTVN6Halide7Runtime8Internal15Synchronization21mutex_parking_controlE@GOTPCREL(%rip), %rax
+	addq	$16, %rax
+	movq	%rax, -16(%rbp)	# local object, offset 0: vtable pointer
+	movq	%rsi, -8(%rbp)	# local object, offset 8: address of the word
+	leaq	-16(%rbp), %rdi	# this = the local object
+	callq	_ZN6Halide7Runtime8Internal15Synchronization15parking_control10unpark_oneEy@PLT
+.LBB41_3:                               # %_ZN6Halide7Runtime8Internal15Synchronization10fast_mutex6unlockEv.exit
+	addq	$16, %rsp
+	popq	%rbp
+	retq
+.Lfunc_end41:
+	.size	_ZN6Halide7Runtime8Internal15Synchronization20wait_parking_control12before_sleepEv, .Lfunc_end41-_ZN6Halide7Runtime8Internal15Synchronization20wait_parking_control12before_sleepEv
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal15Synchronization20wait_parking_control6unparkEib,"axG",@progbits,_ZN6Halide7Runtime8Internal15Synchronization20wait_parking_control6unparkEib,comdat
+	.weak	_ZN6Halide7Runtime8Internal15Synchronization20wait_parking_control6unparkEib # -- Begin function _ZN6Halide7Runtime8Internal15Synchronization20wait_parking_control6unparkEib
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal15Synchronization20wait_parking_control6unparkEib,@function
+_ZN6Halide7Runtime8Internal15Synchronization20wait_parking_control6unparkEib: # @_ZN6Halide7Runtime8Internal15Synchronization20wait_parking_control6unparkEib
+# %bb.0:                                # %entry -- when the third register argument is zero, stores 0 into the word reached through offset 8 of %rdi; always returns 0; the int argument in %esi is unused
+	testl	%edx, %edx	# third register argument (the bool in the mangled signature "ib")
+	jne	.LBB42_2	# non-zero: leave the word alone
+# %bb.1:                                # %if.then
+	pushq	%rbp	# the frame is only set up on this path
+	movq	%rsp, %rbp
+	movq	8(%rdi), %rax	# pointer field at offset 8
+	movq	$0, (%rax)	# clear the word behind it
+	popq	%rbp
+.LBB42_2:                               # %if.end
+	xorl	%eax, %eax	# return value is always 0
+	retq
+.Lfunc_end42:
+	.size	_ZN6Halide7Runtime8Internal15Synchronization20wait_parking_control6unparkEib, .Lfunc_end42-_ZN6Halide7Runtime8Internal15Synchronization20wait_parking_control6unparkEib
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal15Synchronization15parking_control14unpark_requeueEyyy,"ax",@progbits
+	.weak	_ZN6Halide7Runtime8Internal15Synchronization15parking_control14unpark_requeueEyyy # -- Begin function _ZN6Halide7Runtime8Internal15Synchronization15parking_control14unpark_requeueEyyy
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal15Synchronization15parking_control14unpark_requeueEyyy,@function
+_ZN6Halide7Runtime8Internal15Synchronization15parking_control14unpark_requeueEyyy: # @_ZN6Halide7Runtime8Internal15Synchronization15parking_control14unpark_requeueEyyy
+# %bb.0:                                # %entry -- locks the bucket pair for the addresses in %rsi and %rdx, calls vtable slot 0, then walks the first bucket's list: records waiting on the first address are unlinked and either kept as the single record to wake (at most one, only if the action byte is set) or retargeted to the second address and appended to the second bucket; finally wakes the chosen record, passing it the value in %rcx
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	subq	$40, %rsp
+	movq	%rcx, %rbx	# %rbx = fourth argument, the value handed to the woken record
+	movq	%rdx, %r12	# %r12 = third argument, the address records are requeued onto
+	movq	%rsi, %r13	# %r13 = second argument, the address being searched for
+	movq	%rdi, %r14	# %r14 = this
+	leaq	-56(%rbp), %rdi	# result slot for the bucket pair; %rsi and %rdx still carry the two addresses
+	callq	_ZN6Halide7Runtime8Internal15Synchronization16lock_bucket_pairEyy@PLT
+	movb	$0, -72(%rbp)	# action local: byte at offset 0 cleared
+	movq	$0, -64(%rbp)	# action local: quadword at offset 8 cleared
+	movq	(%r14), %rax
+	leaq	-72(%rbp), %rsi
+	movq	%r14, %rdi
+	callq	*(%rax)	# vtable slot 0 with (this, action) -- presumably validate(); confirm against the class declaration
+	testb	%al, %al
+	je	.LBB43_1	# returned false: unlock and return 0
+# %bb.2:                                # %if.end
+	movq	-56(%rbp), %rdx	# first bucket of the pair
+	movq	8(%rdx), %rsi	# first record in its list (bucket offset 8)
+	testq	%rsi, %rsi
+	je	.LBB43_3	# empty list
+# %bb.4:                                # %while.body.preheader
+	addq	$8, %rdx	# %rdx = address of the link that points at the current record
+	xorl	%r15d, %r15d	# %r15 = record chosen to be woken, none yet
+	xorl	%ecx, %ecx	# %rcx = last record of the requeue chain
+	xorl	%eax, %eax	# %rax = first record of the requeue chain
+	xorl	%edi, %edi	# %rdi = last record left in the first bucket
+	jmp	.LBB43_5
+	.p2align	4, 0x90
+.LBB43_6:                               #   in Loop: Header=BB43_5 Depth=1
+	leaq	144(%r8), %rdx	# record stays: its link field becomes the link to patch next
+	movq	%r8, %rdi
+.LBB43_15:                              # %if.end22
+                                        #   in Loop: Header=BB43_5 Depth=1
+	testq	%rsi, %rsi
+	je	.LBB43_16	# end of list
+.LBB43_5:                               # %while.body
+                                        # =>This Inner Loop Header: Depth=1
+	movq	%rsi, %r8	# %r8 = current record
+	movq	136(%rsi), %r9	# address the record is waiting on (record offset 136)
+	movq	144(%rsi), %rsi	# following record (record offset 144)
+	cmpq	%r13, %r9
+	jne	.LBB43_6	# waiting on a different address: leave it in place
+# %bb.7:                                # %if.then4
+                                        #   in Loop: Header=BB43_5 Depth=1
+	movq	%rsi, (%rdx)	# unlink the current record from the first bucket's list
+	movq	-56(%rbp), %r9
+	cmpq	%r8, 16(%r9)
+	je	.LBB43_8	# it was the record referenced by bucket offset 16
+# %bb.9:                                # %if.end10
+                                        #   in Loop: Header=BB43_5 Depth=1
+	cmpb	$0, -72(%rbp)
+	jne	.LBB43_10
+	jmp	.LBB43_12
+.LBB43_8:                               # %if.then7
+                                        #   in Loop: Header=BB43_5 Depth=1
+	movq	%rdi, 16(%r9)	# bucket offset 16 falls back to the last record that stayed
+	cmpb	$0, -72(%rbp)
+	je	.LBB43_12	# action byte clear: always requeue
+.LBB43_10:                              # %if.end10
+                                        #   in Loop: Header=BB43_5 Depth=1
+	testq	%r15, %r15
+	jne	.LBB43_12	# a record to wake was already chosen: requeue this one
+# %bb.11:                               #   in Loop: Header=BB43_5 Depth=1
+	movq	%r8, %r15	# first match with the action byte set becomes the record to wake
+	jmp	.LBB43_15
+	.p2align	4, 0x90
+.LBB43_12:                              # %if.else
+                                        #   in Loop: Header=BB43_5 Depth=1
+	movq	%r8, %r9
+	testq	%rax, %rax
+	je	.LBB43_14	# chain empty: this record becomes its first element
+# %bb.13:                               # %if.else15
+                                        #   in Loop: Header=BB43_5 Depth=1
+	movq	%r8, 144(%rcx)	# otherwise append after the chain's last record
+	movq	%rax, %r9
+.LBB43_14:                              # %if.end17
+                                        #   in Loop: Header=BB43_5 Depth=1
+	movq	%r12, 136(%r8)	# retarget the record to the requeue address
+	movq	%r9, %rax
+	movq	%r8, %rcx
+	jmp	.LBB43_15
+.LBB43_1:                               # %if.then
+	leaq	-56(%rbp), %rdi
+	callq	_ZN6Halide7Runtime8Internal15Synchronization18unlock_bucket_pairERNS2_11bucket_pairE@PLT
+	xorl	%eax, %eax
+	jmp	.LBB43_26
+.LBB43_16:                              # %while.end
+	testq	%rax, %rax
+	je	.LBB43_17	# nothing was requeued
+# %bb.18:                               # %if.then24
+	movq	$0, 144(%rcx)	# terminate the requeue chain
+	movq	-48(%rbp), %rdx	# second bucket of the pair
+	cmpq	$0, 8(%rdx)
+	je	.LBB43_19	# second bucket's list is empty
+# %bb.20:                               # %if.else31
+	movl	$144, %esi
+	addq	16(%rdx), %rsi	# link field of the record referenced by the second bucket's offset 16
+	jmp	.LBB43_21
+.LBB43_3:
+	xorl	%eax, %eax
+	xorl	%r15d, %r15d
+	jmp	.LBB43_22
+.LBB43_17:
+	xorl	%eax, %eax
+	jmp	.LBB43_22
+.LBB43_19:
+	leaq	8(%rdx), %rsi
+.LBB43_21:                              # %if.end35
+	movq	%rax, (%rsi)	# splice the chain onto the second bucket
+	movq	%rcx, 16(%rdx)	# and record its last element at bucket offset 16
+	movb	$1, %al	# remember that something was requeued
+.LBB43_22:                              # %if.end38
+	xorl	%edx, %edx
+	testq	%r15, %r15
+	setne	%dl	# third call argument: whether a record will be woken
+	movq	(%r14), %r8
+	movzbl	%al, %ecx	# fourth call argument: whether any record was requeued
+	leaq	-72(%rbp), %rsi
+	movq	%r14, %rdi
+	callq	*24(%r8)	# vtable slot at offset 24 with (this, action, bool, bool) -- presumably requeue_callback(); confirm against the class declaration
+	testq	%r15, %r15
+	je	.LBB43_24
+# %bb.23:                               # %if.then44
+	movq	%rbx, 152(%r15)	# record offset 152 = value for the woken thread
+	movq	%r15, %rdi
+	callq	pthread_mutex_lock@PLT	# the pthread mutex sits at the start of the record
+	leaq	-56(%rbp), %rdi
+	callq	_ZN6Halide7Runtime8Internal15Synchronization18unlock_bucket_pairERNS2_11bucket_pairE@PLT
+	movb	$0, 128(%r15)	# clear the flag byte at record offset 128
+	leaq	64(%r15), %rdi	# the pthread condition variable sits at record offset 64
+	callq	pthread_cond_signal@PLT
+	movq	%r15, %rdi
+	callq	pthread_mutex_unlock@PLT
+	jmp	.LBB43_25
+.LBB43_24:                              # %if.else48
+	leaq	-56(%rbp), %rdi
+	callq	_ZN6Halide7Runtime8Internal15Synchronization18unlock_bucket_pairERNS2_11bucket_pairE@PLT
+.LBB43_25:                              # %if.end49
+	testq	%r15, %r15
+	setne	%al
+	andb	-72(%rbp), %al	# result = a record was woken, combined with the action byte
+	movzbl	%al, %eax
+.LBB43_26:                              # %cleanup
+	addq	$40, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end43:
+	.size	_ZN6Halide7Runtime8Internal15Synchronization15parking_control14unpark_requeueEyyy, .Lfunc_end43-_ZN6Halide7Runtime8Internal15Synchronization15parking_control14unpark_requeueEyyy
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal15Synchronization16lock_bucket_pairEyy,"ax",@progbits
+	.weak	_ZN6Halide7Runtime8Internal15Synchronization16lock_bucket_pairEyy # -- Begin function _ZN6Halide7Runtime8Internal15Synchronization16lock_bucket_pairEyy
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal15Synchronization16lock_bucket_pairEyy,@function
+_ZN6Halide7Runtime8Internal15Synchronization16lock_bucket_pairEyy: # @_ZN6Halide7Runtime8Internal15Synchronization16lock_bucket_pairEyy
+# %bb.0:                                # %entry -- hashes the two addresses in %rsi and %rdx to 24-byte entries of the global table, locks both entries (lower index first, or just once when they coincide) and writes the two entry pointers to the result slot in %rdi, which is also returned
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r12
+	pushq	%rbx
+	movq	%rdi, %rbx	# %rbx = result slot
+	movabsq	$-7046029254386353131, %rax     # imm = 0x9E3779B97F4A7C15
+	imulq	%rax, %rsi
+	shrq	$54, %rsi	# top 10 bits of the product: table index for the first address
+	imulq	%rax, %rdx
+	shrq	$54, %rdx	# table index for the second address
+	cmpq	%rdx, %rsi
+	jne	.LBB44_2
+# %bb.1:                                # %if.then
+	leaq	(%rsi,%rsi,2), %rcx	# index times 3, scaled by 8 below: 24-byte entries
+	movq	_ZN6Halide7Runtime8Internal15Synchronization5tableE@GOTPCREL(%rip), %rdx
+	leaq	(%rdx,%rcx,8), %r14
+	movl	$1, %esi
+	xorl	%eax, %eax
+	lock		cmpxchgq	%rsi, (%rdx,%rcx,8)	# fast lock: change the entry's first word from 0 to 1
+	movq	%r14, %r15	# both result pointers refer to the same entry
+	movq	%r14, %r12
+	je	.LBB44_11
+.LBB44_10:                              # %cleanup.sink.split
+	movq	%r14, %rdi
+	callq	_ZN6Halide7Runtime8Internal15Synchronization9word_lock9lock_fullEv@PLT	# slow path when the fast exchange failed
+.LBB44_11:                              # %cleanup
+	movq	%r15, (%rbx)	# result offset 0: entry for the first address
+	movq	%r12, 8(%rbx)	# result offset 8: entry for the second address
+	movq	%rbx, %rax
+	popq	%rbx
+	popq	%r12
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.LBB44_2:                               # %if.else
+	jae	.LBB44_7	# flags still come from the index compare: first index above second
+# %bb.3:                                # %if.then3
+	leaq	(%rsi,%rsi,2), %rcx
+	movq	_ZN6Halide7Runtime8Internal15Synchronization5tableE@GOTPCREL(%rip), %rsi
+	leaq	(%rsi,%rcx,8), %r15	# entry for the first address, the lower index here
+	leaq	(%rdx,%rdx,2), %rax
+	leaq	(%rsi,%rax,8), %r14	# entry for the second address
+	movl	$1, %r12d
+	xorl	%eax, %eax
+	lock		cmpxchgq	%r12, (%rsi,%rcx,8)	# lock the lower-index entry first
+	je	.LBB44_5
+# %bb.4:                                # %if.then.i32
+	movq	%r15, %rdi
+	callq	_ZN6Halide7Runtime8Internal15Synchronization9word_lock9lock_fullEv@PLT
+.LBB44_5:                               # %_ZN6Halide7Runtime8Internal15Synchronization9word_lock4lockEv.exit33
+	xorl	%eax, %eax
+	lock		cmpxchgq	%r12, (%r14)	# then the higher-index entry
+	movq	%r14, %r12	# mov leaves the flags of the exchange intact for the branch below
+	jne	.LBB44_10
+	jmp	.LBB44_11
+.LBB44_7:                               # %if.else9
+	leaq	(%rdx,%rdx,2), %rcx
+	movq	_ZN6Halide7Runtime8Internal15Synchronization5tableE@GOTPCREL(%rip), %rdx
+	leaq	(%rdx,%rcx,8), %r12	# entry for the second address, the lower index here
+	leaq	(%rsi,%rsi,2), %rax
+	leaq	(%rdx,%rax,8), %r14	# entry for the first address
+	movl	$1, %r15d
+	xorl	%eax, %eax
+	lock		cmpxchgq	%r15, (%rdx,%rcx,8)	# lock the lower-index entry first
+	je	.LBB44_9
+# %bb.8:                                # %if.then.i37
+	movq	%r12, %rdi
+	callq	_ZN6Halide7Runtime8Internal15Synchronization9word_lock9lock_fullEv@PLT
+.LBB44_9:                               # %_ZN6Halide7Runtime8Internal15Synchronization9word_lock4lockEv.exit38
+	xorl	%eax, %eax
+	lock		cmpxchgq	%r15, (%r14)	# then the higher-index entry
+	movq	%r14, %r15	# mov leaves the flags of the exchange intact for the branch below
+	jne	.LBB44_10
+	jmp	.LBB44_11
+.Lfunc_end44:
+	.size	_ZN6Halide7Runtime8Internal15Synchronization16lock_bucket_pairEyy, .Lfunc_end44-_ZN6Halide7Runtime8Internal15Synchronization16lock_bucket_pairEyy
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal15Synchronization18unlock_bucket_pairERNS2_11bucket_pairE,"ax",@progbits
+	.weak	_ZN6Halide7Runtime8Internal15Synchronization18unlock_bucket_pairERNS2_11bucket_pairE # -- Begin function _ZN6Halide7Runtime8Internal15Synchronization18unlock_bucket_pairERNS2_11bucket_pairE
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal15Synchronization18unlock_bucket_pairERNS2_11bucket_pairE,@function
+_ZN6Halide7Runtime8Internal15Synchronization18unlock_bucket_pairERNS2_11bucket_pairE: # @_ZN6Halide7Runtime8Internal15Synchronization18unlock_bucket_pairERNS2_11bucket_pairE
+# %bb.0:                                # %entry -- releases the two entries referenced by the pair in %rdi by atomically clearing bit 0 of each entry's first word (only once when both pointers are equal, otherwise the higher address first), calling word_lock::unlock_full whenever the previous word was at least 4 with bit 1 clear
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%rbx
+	pushq	%rax	# value is irrelevant; undone by the "addq $8, %rsp" in the epilogues -- presumably stack-alignment padding
+	movq	%rdi, %rbx	# %rbx = the pair
+	movq	(%rdi), %rdi	# first entry pointer
+	movq	8(%rbx), %rcx	# second entry pointer
+	cmpq	%rcx, %rdi
+	je	.LBB45_1	# same entry: unlock once
+# %bb.3:                                # %if.else
+	jbe	.LBB45_11	# first pointer below second: release the second one first
+# %bb.4:                                # %if.then5
+	movq	(%rdi), %rax
+	.p2align	4, 0x90
+.LBB45_5:                               # %atomicrmw.start2
+                                        # =>This Inner Loop Header: Depth=1
+	movq	%rax, %rcx
+	andq	$-2, %rcx
+	lock		cmpxchgq	%rcx, (%rdi)	# clear bit 0 of the first entry; %rax keeps the previous value
+	jne	.LBB45_5
+# %bb.6:                                # %atomicrmw.end1
+	cmpq	$4, %rax
+	jb	.LBB45_9
+# %bb.7:                                # %atomicrmw.end1
+	andl	$2, %eax
+	jne	.LBB45_9
+# %bb.8:                                # %if.then.i29
+	callq	_ZN6Halide7Runtime8Internal15Synchronization9word_lock11unlock_fullEv@PLT	# %rdi still holds the first entry pointer
+.LBB45_9:                               # %_ZN6Halide7Runtime8Internal15Synchronization9word_lock6unlockEv.exit30
+	movq	8(%rbx), %rdi	# now release the second entry
+	movq	(%rdi), %rax
+	.p2align	4, 0x90
+.LBB45_10:                              # %atomicrmw.start8
+                                        # =>This Inner Loop Header: Depth=1
+	movq	%rax, %rcx
+	andq	$-2, %rcx
+	lock		cmpxchgq	%rcx, (%rdi)
+	jne	.LBB45_10
+	jmp	.LBB45_18
+.LBB45_1:                               # %if.then
+	movq	(%rdi), %rax
+	.p2align	4, 0x90
+.LBB45_2:                               # %atomicrmw.start
+                                        # =>This Inner Loop Header: Depth=1
+	movq	%rax, %rcx
+	andq	$-2, %rcx
+	lock		cmpxchgq	%rcx, (%rdi)
+	jne	.LBB45_2
+	jmp	.LBB45_18
+.LBB45_11:                              # %if.else10
+	movq	(%rcx), %rax
+	.p2align	4, 0x90
+.LBB45_12:                              # %atomicrmw.start14
+                                        # =>This Inner Loop Header: Depth=1
+	movq	%rax, %rdx
+	andq	$-2, %rdx
+	lock		cmpxchgq	%rdx, (%rcx)	# clear bit 0 of the second entry
+	jne	.LBB45_12
+# %bb.13:                               # %atomicrmw.end13
+	cmpq	$4, %rax
+	jb	.LBB45_16
+# %bb.14:                               # %atomicrmw.end13
+	andl	$2, %eax
+	jne	.LBB45_16
+# %bb.15:                               # %if.then.i41
+	movq	%rcx, %rdi
+	callq	_ZN6Halide7Runtime8Internal15Synchronization9word_lock11unlock_fullEv@PLT
+.LBB45_16:                              # %_ZN6Halide7Runtime8Internal15Synchronization9word_lock6unlockEv.exit42
+	movq	(%rbx), %rdi	# now release the first entry
+	movq	(%rdi), %rax
+	.p2align	4, 0x90
+.LBB45_17:                              # %atomicrmw.start20
+                                        # =>This Inner Loop Header: Depth=1
+	movq	%rax, %rcx
+	andq	$-2, %rcx
+	lock		cmpxchgq	%rcx, (%rdi)
+	jne	.LBB45_17
+.LBB45_18:                              # %atomicrmw.end19
+	cmpq	$4, %rax	# shared tail: decide about unlock_full for the entry released last
+	jb	.LBB45_20
+# %bb.19:                               # %atomicrmw.end19
+	andl	$2, %eax
+	jne	.LBB45_20
+# %bb.21:                               # %if.end15.sink.split
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%rbp
+	jmp	_ZN6Halide7Runtime8Internal15Synchronization9word_lock11unlock_fullEv@PLT # TAILCALL
+.LBB45_20:                              # %if.end15
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%rbp
+	retq
+.Lfunc_end45:
+	.size	_ZN6Halide7Runtime8Internal15Synchronization18unlock_bucket_pairERNS2_11bucket_pairE, .Lfunc_end45-_ZN6Halide7Runtime8Internal15Synchronization18unlock_bucket_pairERNS2_11bucket_pairE
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal15Synchronization25broadcast_parking_control8validateERNS2_15validate_actionE,"axG",@progbits,_ZN6Halide7Runtime8Internal15Synchronization25broadcast_parking_control8validateERNS2_15validate_actionE,comdat
+	.weak	_ZN6Halide7Runtime8Internal15Synchronization25broadcast_parking_control8validateERNS2_15validate_actionE # -- Begin function _ZN6Halide7Runtime8Internal15Synchronization25broadcast_parking_control8validateERNS2_15validate_actionE
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal15Synchronization25broadcast_parking_control8validateERNS2_15validate_actionE,@function
+_ZN6Halide7Runtime8Internal15Synchronization25broadcast_parking_control8validateERNS2_15validate_actionE: # @_ZN6Halide7Runtime8Internal15Synchronization25broadcast_parking_control8validateERNS2_15validate_actionE
+# %bb.0:                                # %entry -- returns whether the word behind the pointer at offset 8 of %rdi equals the value at offset 16; on a match it zeroes that word, tries to set bit 1 of the word the value points to while its bit 0 is set, and writes the outcome to the first byte of the action in %rsi
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	8(%rdi), %rax	# pointer field at offset 8
+	movq	(%rax), %rcx	# word currently stored behind it
+	movq	16(%rdi), %rdx	# value field at offset 16
+	cmpq	%rdx, %rcx
+	jne	.LBB46_6	# mismatch: return false without side effects
+# %bb.1:                                # %if.end
+	movq	$0, (%rax)	# clear the word
+	movq	16(%rdi), %r8	# the value is used as a pointer from here on
+	movq	(%r8), %rax	# current word behind it
+	movb	$1, %dil	# outcome byte defaults to 1; %rdi is no longer needed as this
+	.p2align	4, 0x90
+.LBB46_2:                               # %if.end
+                                        # =>This Inner Loop Header: Depth=1
+	testb	$1, %al
+	je	.LBB46_5	# bit 0 clear: keep the outcome at 1
+# %bb.3:                                # %if.end.i
+                                        #   in Loop: Header=BB46_2 Depth=1
+	movq	%rax, %r9
+	orq	$2, %r9
+	lock		cmpxchgq	%r9, (%r8)	# bit 0 set: try to set bit 1 as well
+	jne	.LBB46_2	# word changed underneath; retry with the fresh value in %rax
+# %bb.4:
+	xorl	%edi, %edi	# bit 1 was set successfully: outcome becomes 0
+.LBB46_5:                               # %_ZN6Halide7Runtime8Internal15Synchronization10fast_mutex21make_parked_if_lockedEv.exit
+	movb	%dil, (%rsi)	# store the outcome in the first byte of the action
+.LBB46_6:                               # %cleanup
+	cmpq	%rdx, %rcx	# compare repeated because the flags were clobbered on the matching path
+	sete	%al
+	popq	%rbp
+	retq
+.Lfunc_end46:
+	.size	_ZN6Halide7Runtime8Internal15Synchronization25broadcast_parking_control8validateERNS2_15validate_actionE, .Lfunc_end46-_ZN6Halide7Runtime8Internal15Synchronization25broadcast_parking_control8validateERNS2_15validate_actionE
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal15Synchronization15parking_control6unparkEib,"axG",@progbits,_ZN6Halide7Runtime8Internal15Synchronization15parking_control6unparkEib,comdat
+	.weak	_ZN6Halide7Runtime8Internal15Synchronization15parking_control6unparkEib # -- Begin function _ZN6Halide7Runtime8Internal15Synchronization15parking_control6unparkEib
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal15Synchronization15parking_control6unparkEib,@function
+_ZN6Halide7Runtime8Internal15Synchronization15parking_control6unparkEib: # @_ZN6Halide7Runtime8Internal15Synchronization15parking_control6unparkEib
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	xorl	%eax, %eax
+	popq	%rbp
+	retq
+.Lfunc_end47:
+	.size	_ZN6Halide7Runtime8Internal15Synchronization15parking_control6unparkEib, .Lfunc_end47-_ZN6Halide7Runtime8Internal15Synchronization15parking_control6unparkEib
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal15Synchronization25broadcast_parking_control16requeue_callbackERKNS2_15validate_actionEbb,"axG",@progbits,_ZN6Halide7Runtime8Internal15Synchronization25broadcast_parking_control16requeue_callbackERKNS2_15validate_actionEbb,comdat
+	.weak	_ZN6Halide7Runtime8Internal15Synchronization25broadcast_parking_control16requeue_callbackERKNS2_15validate_actionEbb # -- Begin function _ZN6Halide7Runtime8Internal15Synchronization25broadcast_parking_control16requeue_callbackERKNS2_15validate_actionEbb
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal15Synchronization25broadcast_parking_control16requeue_callbackERKNS2_15validate_actionEbb,@function
+_ZN6Halide7Runtime8Internal15Synchronization25broadcast_parking_control16requeue_callbackERKNS2_15validate_actionEbb: # @_ZN6Halide7Runtime8Internal15Synchronization25broadcast_parking_control16requeue_callbackERKNS2_15validate_actionEbb
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	cmpb	$0, (%rsi)
+	je	.LBB48_3
+# %bb.1:                                # %entry
+	testb	%cl, %cl
+	je	.LBB48_3
+# %bb.2:                                # %if.then
+	movq	16(%rdi), %rax
+	lock		orq	$2, (%rax)
+.LBB48_3:                               # %if.end
+	popq	%rbp
+	retq
+.Lfunc_end48:
+	.size	_ZN6Halide7Runtime8Internal15Synchronization25broadcast_parking_control16requeue_callbackERKNS2_15validate_actionEbb, .Lfunc_end48-_ZN6Halide7Runtime8Internal15Synchronization25broadcast_parking_control16requeue_callbackERKNS2_15validate_actionEbb
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal27default_desired_num_threadsEv,"ax",@progbits
+	.weak	_ZN6Halide7Runtime8Internal27default_desired_num_threadsEv # -- Begin function _ZN6Halide7Runtime8Internal27default_desired_num_threadsEv
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal27default_desired_num_threadsEv,@function
+_ZN6Halide7Runtime8Internal27default_desired_num_threadsEv: # @_ZN6Halide7Runtime8Internal27default_desired_num_threadsEv
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	leaq	.L.str.1(%rip), %rdi
+	callq	getenv@PLT
+	testq	%rax, %rax
+	jne	.LBB49_2
+# %bb.1:                                # %if.end
+	leaq	.L.str.2(%rip), %rdi
+	callq	getenv@PLT
+	testq	%rax, %rax
+	je	.LBB49_3
+.LBB49_2:                               # %cond.true
+	movq	%rax, %rdi
+	popq	%rbp
+	jmp	atoi@PLT                        # TAILCALL
+.LBB49_3:                               # %cond.false
+	popq	%rbp
+	jmp	halide_host_cpu_count@PLT       # TAILCALL
+.Lfunc_end49:
+	.size	_ZN6Halide7Runtime8Internal27default_desired_num_threadsEv, .Lfunc_end49-_ZN6Halide7Runtime8Internal27default_desired_num_threadsEv
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal13worker_threadEPv,"ax",@progbits
+	.weak	_ZN6Halide7Runtime8Internal13worker_threadEPv # -- Begin function _ZN6Halide7Runtime8Internal13worker_threadEPv
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal13worker_threadEPv,@function
+_ZN6Halide7Runtime8Internal13worker_threadEPv: # @_ZN6Halide7Runtime8Internal13worker_threadEPv
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r14
+	pushq	%rbx
+	movq	%rdi, %rbx
+	movq	_ZN6Halide7Runtime8Internal10work_queueE@GOTPCREL(%rip), %r14
+	movq	%r14, %rdi
+	callq	halide_mutex_lock@PLT
+	movq	%rbx, %rdi
+	callq	_ZN6Halide7Runtime8Internal28worker_thread_already_lockedEPNS1_4workE@PLT
+	movq	%r14, %rdi
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	jmp	halide_mutex_unlock@PLT         # TAILCALL
+.Lfunc_end50:
+	.size	_ZN6Halide7Runtime8Internal13worker_threadEPv, .Lfunc_end50-_ZN6Halide7Runtime8Internal13worker_threadEPv
+                                        # -- End function
+	.section	.text.halide_spawn_thread,"ax",@progbits
+	.weak	halide_spawn_thread             # -- Begin function halide_spawn_thread
+	.p2align	4, 0x90
+	.type	halide_spawn_thread,@function
+halide_spawn_thread:                    # @halide_spawn_thread
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%rbx
+	pushq	%rax
+	movq	%rsi, %rbx
+	movq	%rdi, %r14
+	movl	$24, %edi
+	callq	malloc@PLT
+	movq	%rax, %r15
+	movq	%r14, (%rax)
+	movq	%rbx, 8(%rax)
+	leaq	16(%rax), %rdi
+	movq	$0, 16(%rax)
+	movq	_ZN6Halide7Runtime8Internal19spawn_thread_helperEPv@GOTPCREL(%rip), %rdx
+	xorl	%esi, %esi
+	movq	%rax, %rcx
+	callq	pthread_create@PLT
+	movq	%r15, %rax
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end51:
+	.size	halide_spawn_thread, .Lfunc_end51-halide_spawn_thread
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal19spawn_thread_helperEPv,"ax",@progbits
+	.weak	_ZN6Halide7Runtime8Internal19spawn_thread_helperEPv # -- Begin function _ZN6Halide7Runtime8Internal19spawn_thread_helperEPv
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal19spawn_thread_helperEPv,@function
+_ZN6Halide7Runtime8Internal19spawn_thread_helperEPv: # @_ZN6Halide7Runtime8Internal19spawn_thread_helperEPv
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	%rdi, %rax
+	movq	8(%rdi), %rdi
+	callq	*(%rax)
+	xorl	%eax, %eax
+	popq	%rbp
+	retq
+.Lfunc_end52:
+	.size	_ZN6Halide7Runtime8Internal19spawn_thread_helperEPv, .Lfunc_end52-_ZN6Halide7Runtime8Internal19spawn_thread_helperEPv
+                                        # -- End function
+	.section	.text.halide_default_do_parallel_tasks,"ax",@progbits
+	.weak	halide_default_do_parallel_tasks # -- Begin function halide_default_do_parallel_tasks
+	.p2align	4, 0x90
+	.type	halide_default_do_parallel_tasks,@function
+halide_default_do_parallel_tasks:       # @halide_default_do_parallel_tasks
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%rbx
+	pushq	%rax
+	movq	%rcx, %r15
+	movl	%esi, %r14d
+	movslq	%esi, %rax
+	movq	%rax, %rcx
+	shlq	$7, %rcx
+	movq	%rsp, %rbx
+	subq	%rcx, %rbx
+	movq	%rbx, %rsp
+	testl	%eax, %eax
+	jle	.LBB53_4
+# %bb.1:                                # %for.body.preheader
+	leaq	124(%rbx), %rax
+	xorl	%ecx, %ecx
+	jmp	.LBB53_2
+	.p2align	4, 0x90
+.LBB53_6:                               # %if.end
+                                        #   in Loop: Header=BB53_2 Depth=1
+	vmovups	(%rdx), %ymm0
+	vmovups	24(%rdx), %ymm1
+	addq	$56, %rdx
+	vmovups	%ymm1, -100(%rax)
+	vmovups	%ymm0, -124(%rax)
+	movq	$0, -68(%rax)
+	movq	%rdi, -20(%rax)
+	movq	$0, -12(%rax)
+	movl	$0, -4(%rax)
+	movb	$0, (%rax)
+	movq	%r15, -36(%rax)
+.LBB53_7:                               # %for.inc
+                                        #   in Loop: Header=BB53_2 Depth=1
+	incq	%rcx
+	movslq	%r14d, %rsi
+	subq	$-128, %rax
+	cmpq	%rsi, %rcx
+	jge	.LBB53_4
+.LBB53_2:                               # %for.body
+                                        # =>This Inner Loop Header: Depth=1
+	cmpl	$0, 40(%rdx)
+	jg	.LBB53_6
+# %bb.3:                                # %if.then
+                                        #   in Loop: Header=BB53_2 Depth=1
+	decl	%r14d
+	jmp	.LBB53_7
+.LBB53_4:                               # %for.cond.cleanup
+	testl	%r14d, %r14d
+	je	.LBB53_5
+# %bb.8:                                # %if.end19
+	movq	_ZN6Halide7Runtime8Internal10work_queueE@GOTPCREL(%rip), %rdi
+	vzeroupper
+	callq	halide_mutex_lock@PLT
+	movl	%r14d, %edi
+	movq	%rbx, %rsi
+	movq	%r15, %rdx
+	callq	_ZN6Halide7Runtime8Internal27enqueue_work_already_lockedEiPNS1_4workES3_@PLT
+	testl	%r14d, %r14d
+	jle	.LBB53_9
+# %bb.12:                               # %for.body25.preheader
+	movl	%r14d, %r15d
+	xorl	%r14d, %r14d
+	.p2align	4, 0x90
+.LBB53_13:                              # %for.body25
+                                        # =>This Inner Loop Header: Depth=1
+	movq	%rbx, %rdi
+	callq	_ZN6Halide7Runtime8Internal28worker_thread_already_lockedEPNS1_4workE@PLT
+	movl	116(%rbx), %eax
+	testl	%eax, %eax
+	cmovnel	%eax, %r14d
+	subq	$-128, %rbx
+	decq	%r15
+	jne	.LBB53_13
+	jmp	.LBB53_10
+.LBB53_5:
+	xorl	%r14d, %r14d
+	jmp	.LBB53_11
+.LBB53_9:
+	xorl	%r14d, %r14d
+.LBB53_10:                              # %for.cond.cleanup24
+	movq	_ZN6Halide7Runtime8Internal10work_queueE@GOTPCREL(%rip), %rdi
+	callq	halide_mutex_unlock@PLT
+.LBB53_11:                              # %cleanup
+	movl	%r14d, %eax
+	leaq	-24(%rbp), %rsp
+	popq	%rbx
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	vzeroupper
+	retq
+.Lfunc_end53:
+	.size	halide_default_do_parallel_tasks, .Lfunc_end53-halide_default_do_parallel_tasks
+                                        # -- End function
+	.section	.text.halide_default_semaphore_init,"ax",@progbits
+	.weak	halide_default_semaphore_init   # -- Begin function halide_default_semaphore_init
+	.p2align	4, 0x90
+	.type	halide_default_semaphore_init,@function
+halide_default_semaphore_init:          # @halide_default_semaphore_init
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movl	%esi, %eax
+	movl	%esi, (%rdi)
+	popq	%rbp
+	retq
+.Lfunc_end54:
+	.size	halide_default_semaphore_init, .Lfunc_end54-halide_default_semaphore_init
+                                        # -- End function
+	.section	.text.halide_default_semaphore_release,"ax",@progbits
+	.weak	halide_default_semaphore_release # -- Begin function halide_default_semaphore_release
+	.p2align	4, 0x90
+	.type	halide_default_semaphore_release,@function
+halide_default_semaphore_release:       # @halide_default_semaphore_release
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%rbx
+	pushq	%rax
+	movl	%esi, %ebx
+	movl	%esi, %r14d
+	lock		xaddl	%r14d, (%rdi)
+	testl	%esi, %esi
+	je	.LBB55_3
+# %bb.1:                                # %entry
+	testl	%r14d, %r14d
+	jne	.LBB55_3
+# %bb.2:                                # %if.then
+	movq	_ZN6Halide7Runtime8Internal10work_queueE@GOTPCREL(%rip), %r15
+	movq	%r15, %rdi
+	callq	halide_mutex_lock@PLT
+	leaq	40(%r15), %rdi
+	callq	halide_cond_broadcast@PLT
+	leaq	56(%r15), %rdi
+	callq	halide_cond_broadcast@PLT
+	movq	%r15, %rdi
+	callq	halide_mutex_unlock@PLT
+.LBB55_3:                               # %if.end
+	addl	%ebx, %r14d
+	movl	%r14d, %eax
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end55:
+	.size	halide_default_semaphore_release, .Lfunc_end55-halide_default_semaphore_release
+                                        # -- End function
+	.section	.text.halide_thread_pool_cleanup,"ax",@progbits
+	.weak	halide_thread_pool_cleanup      # -- Begin function halide_thread_pool_cleanup
+	.p2align	4, 0x90
+	.type	halide_thread_pool_cleanup,@function
+halide_thread_pool_cleanup:             # @halide_thread_pool_cleanup
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	popq	%rbp
+	jmp	halide_shutdown_thread_pool@PLT # TAILCALL
+.Lfunc_end56:
+	.size	halide_thread_pool_cleanup, .Lfunc_end56-halide_thread_pool_cleanup
+                                        # -- End function
+	.section	.text.halide_shutdown_thread_pool,"ax",@progbits
+	.weak	halide_shutdown_thread_pool     # -- Begin function halide_shutdown_thread_pool
+	.p2align	4, 0x90
+	.type	halide_shutdown_thread_pool,@function
+halide_shutdown_thread_pool:            # @halide_shutdown_thread_pool
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r14
+	pushq	%rbx
+	movq	_ZN6Halide7Runtime8Internal10work_queueE@GOTPCREL(%rip), %rbx
+	cmpb	$0, 2121(%rbx)
+	je	.LBB57_5
+# %bb.1:                                # %if.then
+	movq	%rbx, %rdi
+	callq	halide_mutex_lock@PLT
+	movb	$1, 2120(%rbx)
+	leaq	56(%rbx), %rdi
+	callq	halide_cond_broadcast@PLT
+	leaq	40(%rbx), %rdi
+	callq	halide_cond_broadcast@PLT
+	leaq	48(%rbx), %rdi
+	callq	halide_cond_broadcast@PLT
+	movq	%rbx, %rdi
+	callq	halide_mutex_unlock@PLT
+	cmpl	$0, 24(%rbx)
+	jle	.LBB57_4
+# %bb.2:                                # %for.body.preheader
+	xorl	%r14d, %r14d
+	.p2align	4, 0x90
+.LBB57_3:                               # %for.body
+                                        # =>This Inner Loop Header: Depth=1
+	movq	72(%rbx,%r14,8), %rdi
+	callq	halide_join_thread@PLT
+	incq	%r14
+	movslq	24(%rbx), %rax
+	cmpq	%rax, %r14
+	jl	.LBB57_3
+.LBB57_4:                               # %for.cond.cleanup
+	addq	$12, %rbx
+	movl	$2116, %edx                     # imm = 0x844
+	movq	%rbx, %rdi
+	xorl	%esi, %esi
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	jmp	memset@PLT                      # TAILCALL
+.LBB57_5:                               # %if.end
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	retq
+.Lfunc_end57:
+	.size	halide_shutdown_thread_pool, .Lfunc_end57-halide_shutdown_thread_pool
+                                        # -- End function
+	.section	.text.halide_join_thread,"ax",@progbits
+	.weak	halide_join_thread              # -- Begin function halide_join_thread
+	.p2align	4, 0x90
+	.type	halide_join_thread,@function
+halide_join_thread:                     # @halide_join_thread
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%rbx
+	pushq	%rax
+	movq	%rdi, %rbx
+	movq	$0, -16(%rbp)
+	movq	16(%rdi), %rdi
+	leaq	-16(%rbp), %rsi
+	callq	pthread_join@PLT
+	movq	%rbx, %rdi
+	callq	free@PLT
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%rbp
+	retq
+.Lfunc_end58:
+	.size	halide_join_thread, .Lfunc_end58-halide_join_thread
+                                        # -- End function
+	.section	.text.halide_cond_signal,"ax",@progbits
+	.weak	halide_cond_signal              # -- Begin function halide_cond_signal
+	.p2align	4, 0x90
+	.type	halide_cond_signal,@function
+halide_cond_signal:                     # @halide_cond_signal
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	subq	$32, %rsp
+	movq	(%rdi), %rax
+	testq	%rax, %rax
+	je	.LBB59_2
+# %bb.1:                                # %if.end.i
+	movq	%rdi, %rsi
+	movq	_ZTVN6Halide7Runtime8Internal15Synchronization22signal_parking_controlE@GOTPCREL(%rip), %rcx
+	addq	$16, %rcx
+	movq	%rcx, -24(%rbp)
+	movq	%rdi, -16(%rbp)
+	movq	%rax, -8(%rbp)
+	leaq	-24(%rbp), %rdi
+	callq	_ZN6Halide7Runtime8Internal15Synchronization15parking_control10unpark_oneEy@PLT
+.LBB59_2:                               # %_ZN6Halide7Runtime8Internal15Synchronization9fast_cond6signalEv.exit
+	addq	$32, %rsp
+	popq	%rbp
+	retq
+.Lfunc_end59:
+	.size	halide_cond_signal, .Lfunc_end59-halide_cond_signal
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal15Synchronization15parking_control8validateERNS2_15validate_actionE,"axG",@progbits,_ZN6Halide7Runtime8Internal15Synchronization15parking_control8validateERNS2_15validate_actionE,comdat
+	.weak	_ZN6Halide7Runtime8Internal15Synchronization15parking_control8validateERNS2_15validate_actionE # -- Begin function _ZN6Halide7Runtime8Internal15Synchronization15parking_control8validateERNS2_15validate_actionE
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal15Synchronization15parking_control8validateERNS2_15validate_actionE,@function
+_ZN6Halide7Runtime8Internal15Synchronization15parking_control8validateERNS2_15validate_actionE: # @_ZN6Halide7Runtime8Internal15Synchronization15parking_control8validateERNS2_15validate_actionE
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movb	$1, %al
+	popq	%rbp
+	retq
+.Lfunc_end60:
+	.size	_ZN6Halide7Runtime8Internal15Synchronization15parking_control8validateERNS2_15validate_actionE, .Lfunc_end60-_ZN6Halide7Runtime8Internal15Synchronization15parking_control8validateERNS2_15validate_actionE
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal15Synchronization22signal_parking_control6unparkEib,"axG",@progbits,_ZN6Halide7Runtime8Internal15Synchronization22signal_parking_control6unparkEib,comdat
+	.weak	_ZN6Halide7Runtime8Internal15Synchronization22signal_parking_control6unparkEib # -- Begin function _ZN6Halide7Runtime8Internal15Synchronization22signal_parking_control6unparkEib
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal15Synchronization22signal_parking_control6unparkEib,@function
+_ZN6Halide7Runtime8Internal15Synchronization22signal_parking_control6unparkEib: # @_ZN6Halide7Runtime8Internal15Synchronization22signal_parking_control6unparkEib
+# %bb.0:                                # %entry
+	testl	%edx, %edx
+	jne	.LBB61_2
+# %bb.1:                                # %if.then
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	8(%rdi), %rax
+	movq	$0, (%rax)
+	popq	%rbp
+.LBB61_2:                               # %if.end
+	xorl	%eax, %eax
+	retq
+.Lfunc_end61:
+	.size	_ZN6Halide7Runtime8Internal15Synchronization22signal_parking_control6unparkEib, .Lfunc_end61-_ZN6Halide7Runtime8Internal15Synchronization22signal_parking_control6unparkEib
+                                        # -- End function
+	.section	.text.halide_mutex_array_create,"ax",@progbits
+	.weak	halide_mutex_array_create       # -- Begin function halide_mutex_array_create
+	.p2align	4, 0x90
+	.type	halide_mutex_array_create,@function
+halide_mutex_array_create:              # @halide_mutex_array_create
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%rbx
+	pushq	%rax
+	movl	%edi, %r15d
+	xorl	%r14d, %r14d
+	movl	$8, %esi
+	xorl	%edi, %edi
+	callq	halide_malloc@PLT
+	testq	%rax, %rax
+	je	.LBB62_4
+# %bb.1:                                # %if.end
+	movq	%rax, %rbx
+	movslq	%r15d, %r14
+	shlq	$3, %r14
+	xorl	%edi, %edi
+	movq	%r14, %rsi
+	callq	halide_malloc@PLT
+	movq	%rax, (%rbx)
+	testq	%rax, %rax
+	je	.LBB62_2
+# %bb.3:                                # %if.end6
+	movq	%rax, %rdi
+	xorl	%esi, %esi
+	movq	%r14, %rdx
+	callq	memset@PLT
+	movq	%rbx, %r14
+	jmp	.LBB62_4
+.LBB62_2:                               # %if.then5
+	xorl	%r14d, %r14d
+	xorl	%edi, %edi
+	movq	%rbx, %rsi
+	callq	halide_free@PLT
+.LBB62_4:                               # %cleanup
+	movq	%r14, %rax
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end62:
+	.size	halide_mutex_array_create, .Lfunc_end62-halide_mutex_array_create
+                                        # -- End function
+	.section	.text.halide_mutex_array_destroy,"ax",@progbits
+	.weak	halide_mutex_array_destroy      # -- Begin function halide_mutex_array_destroy
+	.p2align	4, 0x90
+	.type	halide_mutex_array_destroy,@function
+halide_mutex_array_destroy:             # @halide_mutex_array_destroy
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r14
+	pushq	%rbx
+	movq	%rsi, %rbx
+	movq	%rdi, %r14
+	movq	(%rsi), %rsi
+	callq	halide_free@PLT
+	movq	%r14, %rdi
+	movq	%rbx, %rsi
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	jmp	halide_free@PLT                 # TAILCALL
+.Lfunc_end63:
+	.size	halide_mutex_array_destroy, .Lfunc_end63-halide_mutex_array_destroy
+                                        # -- End function
+	.section	.text.halide_mutex_array_lock,"ax",@progbits
+	.weak	halide_mutex_array_lock         # -- Begin function halide_mutex_array_lock
+	.p2align	4, 0x90
+	.type	halide_mutex_array_lock,@function
+halide_mutex_array_lock:                # @halide_mutex_array_lock
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movslq	%esi, %rax
+	shlq	$3, %rax
+	addq	(%rdi), %rax
+	movq	%rax, %rdi
+	callq	halide_mutex_lock@PLT
+	xorl	%eax, %eax
+	popq	%rbp
+	retq
+.Lfunc_end64:
+	.size	halide_mutex_array_lock, .Lfunc_end64-halide_mutex_array_lock
+                                        # -- End function
+	.section	.text.halide_mutex_array_unlock,"ax",@progbits
+	.weak	halide_mutex_array_unlock       # -- Begin function halide_mutex_array_unlock
+	.p2align	4, 0x90
+	.type	halide_mutex_array_unlock,@function
+halide_mutex_array_unlock:              # @halide_mutex_array_unlock
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movslq	%esi, %rax
+	shlq	$3, %rax
+	addq	(%rdi), %rax
+	movq	%rax, %rdi
+	callq	halide_mutex_unlock@PLT
+	xorl	%eax, %eax
+	popq	%rbp
+	retq
+.Lfunc_end65:
+	.size	halide_mutex_array_unlock, .Lfunc_end65-halide_mutex_array_unlock
+                                        # -- End function
+	.section	.text.halide_set_num_threads,"ax",@progbits
+	.weak	halide_set_num_threads          # -- Begin function halide_set_num_threads
+	.p2align	4, 0x90
+	.type	halide_set_num_threads,@function
+halide_set_num_threads:                 # @halide_set_num_threads
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r14
+	pushq	%rbx
+	movl	%edi, %ebx
+	testl	%edi, %edi
+	js	.LBB66_1
+# %bb.2:                                # %if.end
+	movq	_ZN6Halide7Runtime8Internal10work_queueE@GOTPCREL(%rip), %rdi
+	callq	halide_mutex_lock@PLT
+	testl	%ebx, %ebx
+	jne	.LBB66_4
+# %bb.3:                                # %if.then2
+	callq	_ZN6Halide7Runtime8Internal27default_desired_num_threadsEv@PLT
+	movl	%eax, %ebx
+	jmp	.LBB66_4
+.LBB66_1:                               # %if.end.thread
+	leaq	.L.str.4(%rip), %rsi
+	xorl	%edi, %edi
+	callq	halide_error@PLT
+	movq	_ZN6Halide7Runtime8Internal10work_queueE@GOTPCREL(%rip), %rdi
+	callq	halide_mutex_lock@PLT
+.LBB66_4:                               # %if.end3
+	movq	_ZN6Halide7Runtime8Internal10work_queueE@GOTPCREL(%rip), %rdi
+	movl	8(%rdi), %r14d
+	cmpl	$2, %ebx
+	movl	$1, %eax
+	cmovgel	%ebx, %eax
+	cmpl	$256, %eax                      # imm = 0x100
+	movl	$256, %ecx                      # imm = 0x100
+	cmovll	%eax, %ecx
+	movl	%ecx, 8(%rdi)
+	callq	halide_mutex_unlock@PLT
+	movl	%r14d, %eax
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	retq
+.Lfunc_end66:
+	.size	halide_set_num_threads, .Lfunc_end66-halide_set_num_threads
+                                        # -- End function
+	.section	.text.halide_set_custom_do_task,"ax",@progbits
+	.weak	halide_set_custom_do_task       # -- Begin function halide_set_custom_do_task
+	.p2align	4, 0x90
+	.type	halide_set_custom_do_task,@function
+halide_set_custom_do_task:              # @halide_set_custom_do_task
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	_ZN6Halide7Runtime8Internal14custom_do_taskE@GOTPCREL(%rip), %rcx
+	movq	(%rcx), %rax
+	movq	%rdi, (%rcx)
+	popq	%rbp
+	retq
+.Lfunc_end67:
+	.size	halide_set_custom_do_task, .Lfunc_end67-halide_set_custom_do_task
+                                        # -- End function
+	.section	.text.halide_set_custom_do_loop_task,"ax",@progbits
+	.weak	halide_set_custom_do_loop_task  # -- Begin function halide_set_custom_do_loop_task
+	.p2align	4, 0x90
+	.type	halide_set_custom_do_loop_task,@function
+halide_set_custom_do_loop_task:         # @halide_set_custom_do_loop_task
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	_ZN6Halide7Runtime8Internal19custom_do_loop_taskE@GOTPCREL(%rip), %rcx
+	movq	(%rcx), %rax
+	movq	%rdi, (%rcx)
+	popq	%rbp
+	retq
+.Lfunc_end68:
+	.size	halide_set_custom_do_loop_task, .Lfunc_end68-halide_set_custom_do_loop_task
+                                        # -- End function
+	.section	.text.halide_set_custom_do_par_for,"ax",@progbits
+	.weak	halide_set_custom_do_par_for    # -- Begin function halide_set_custom_do_par_for
+	.p2align	4, 0x90
+	.type	halide_set_custom_do_par_for,@function
+halide_set_custom_do_par_for:           # @halide_set_custom_do_par_for
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	_ZN6Halide7Runtime8Internal17custom_do_par_forE@GOTPCREL(%rip), %rcx
+	movq	(%rcx), %rax
+	movq	%rdi, (%rcx)
+	popq	%rbp
+	retq
+.Lfunc_end69:
+	.size	halide_set_custom_do_par_for, .Lfunc_end69-halide_set_custom_do_par_for
+                                        # -- End function
+	.section	.text.halide_set_custom_parallel_runtime,"ax",@progbits
+	.weak	halide_set_custom_parallel_runtime # -- Begin function halide_set_custom_parallel_runtime
+	.p2align	4, 0x90
+	.type	halide_set_custom_parallel_runtime,@function
+halide_set_custom_parallel_runtime:     # @halide_set_custom_parallel_runtime
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	16(%rbp), %rax
+	movq	_ZN6Halide7Runtime8Internal17custom_do_par_forE@GOTPCREL(%rip), %r10
+	movq	%rdi, (%r10)
+	movq	_ZN6Halide7Runtime8Internal14custom_do_taskE@GOTPCREL(%rip), %rdi
+	movq	%rsi, (%rdi)
+	movq	_ZN6Halide7Runtime8Internal19custom_do_loop_taskE@GOTPCREL(%rip), %rsi
+	movq	%rdx, (%rsi)
+	movq	_ZN6Halide7Runtime8Internal24custom_do_parallel_tasksE@GOTPCREL(%rip), %rdx
+	movq	%rcx, (%rdx)
+	movq	_ZN6Halide7Runtime8Internal21custom_semaphore_initE@GOTPCREL(%rip), %rcx
+	movq	%r8, (%rcx)
+	movq	_ZN6Halide7Runtime8Internal28custom_semaphore_try_acquireE@GOTPCREL(%rip), %rcx
+	movq	%r9, (%rcx)
+	movq	_ZN6Halide7Runtime8Internal24custom_semaphore_releaseE@GOTPCREL(%rip), %rcx
+	movq	%rax, (%rcx)
+	popq	%rbp
+	retq
+.Lfunc_end70:
+	.size	halide_set_custom_parallel_runtime, .Lfunc_end70-halide_set_custom_parallel_runtime
+                                        # -- End function
+	.section	.text.halide_do_par_for,"ax",@progbits
+	.weak	halide_do_par_for               # -- Begin function halide_do_par_for
+	.p2align	4, 0x90
+	.type	halide_do_par_for,@function
+halide_do_par_for:                      # @halide_do_par_for
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	_ZN6Halide7Runtime8Internal17custom_do_par_forE@GOTPCREL(%rip), %rax
+	movq	(%rax), %rax
+	popq	%rbp
+	jmpq	*%rax                           # TAILCALL
+.Lfunc_end71:
+	.size	halide_do_par_for, .Lfunc_end71-halide_do_par_for
+                                        # -- End function
+	.section	.text.halide_do_parallel_tasks,"ax",@progbits
+	.weak	halide_do_parallel_tasks        # -- Begin function halide_do_parallel_tasks
+	.p2align	4, 0x90
+	.type	halide_do_parallel_tasks,@function
+halide_do_parallel_tasks:               # @halide_do_parallel_tasks
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	_ZN6Halide7Runtime8Internal24custom_do_parallel_tasksE@GOTPCREL(%rip), %rax
+	movq	(%rax), %rax
+	popq	%rbp
+	jmpq	*%rax                           # TAILCALL
+.Lfunc_end72:
+	.size	halide_do_parallel_tasks, .Lfunc_end72-halide_do_parallel_tasks
+                                        # -- End function
+	.section	.text.halide_semaphore_init,"ax",@progbits
+	.weak	halide_semaphore_init           # -- Begin function halide_semaphore_init
+	.p2align	4, 0x90
+	.type	halide_semaphore_init,@function
+halide_semaphore_init:                  # @halide_semaphore_init
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	_ZN6Halide7Runtime8Internal21custom_semaphore_initE@GOTPCREL(%rip), %rax
+	movq	(%rax), %rax
+	popq	%rbp
+	jmpq	*%rax                           # TAILCALL
+.Lfunc_end73:
+	.size	halide_semaphore_init, .Lfunc_end73-halide_semaphore_init
+                                        # -- End function
+	.section	.text.halide_semaphore_release,"ax",@progbits
+	.weak	halide_semaphore_release        # -- Begin function halide_semaphore_release
+	.p2align	4, 0x90
+	.type	halide_semaphore_release,@function
+halide_semaphore_release:               # @halide_semaphore_release
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	_ZN6Halide7Runtime8Internal24custom_semaphore_releaseE@GOTPCREL(%rip), %rax
+	movq	(%rax), %rax
+	popq	%rbp
+	jmpq	*%rax                           # TAILCALL
+.Lfunc_end74:
+	.size	halide_semaphore_release, .Lfunc_end74-halide_semaphore_release
+                                        # -- End function
+	.section	.text.halide_semaphore_try_acquire,"ax",@progbits
+	.weak	halide_semaphore_try_acquire    # -- Begin function halide_semaphore_try_acquire
+	.p2align	4, 0x90
+	.type	halide_semaphore_try_acquire,@function
+halide_semaphore_try_acquire:           # @halide_semaphore_try_acquire
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	_ZN6Halide7Runtime8Internal28custom_semaphore_try_acquireE@GOTPCREL(%rip), %rax
+	movq	(%rax), %rax
+	popq	%rbp
+	jmpq	*%rax                           # TAILCALL
+.Lfunc_end75:
+	.size	halide_semaphore_try_acquire, .Lfunc_end75-halide_semaphore_try_acquire
+                                        # -- End function
+	.section	.text.halide_default_get_symbol,"ax",@progbits
+	.weak	halide_default_get_symbol       # -- Begin function halide_default_get_symbol
+	.p2align	4, 0x90
+	.type	halide_default_get_symbol,@function
+halide_default_get_symbol:              # @halide_default_get_symbol
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	%rdi, %rsi
+	xorl	%edi, %edi
+	popq	%rbp
+	jmp	dlsym@PLT                       # TAILCALL
+.Lfunc_end76:
+	.size	halide_default_get_symbol, .Lfunc_end76-halide_default_get_symbol
+                                        # -- End function
+	.section	.text.halide_default_load_library,"ax",@progbits  # function gets its own .text.<name> section
+	.weak	halide_default_load_library     # -- Begin function halide_default_load_library (weak symbol): calls dlopen(arg, 1), calls dlerror() if that returned null, and returns the dlopen result either way
+	.p2align	4, 0x90  # align entry to 16 bytes, padding with 0x90 (x86 NOP)
+	.type	halide_default_load_library,@function
+halide_default_load_library:            # @halide_default_load_library
+# %bb.0:                                # %entry
+	pushq	%rbp  # standard frame-pointer prologue
+	movq	%rsp, %rbp
+	pushq	%rbx  # rbx is callee-saved; it will hold the dlopen result across the dlerror call
+	pushq	%rax  # value is never read back: this push only keeps rsp 16-byte aligned at the calls below
+	movl	$1, %esi  # dlopen's 2nd argument (flags) = 1; NOTE(review): presumably RTLD_LAZY, numeric value is libc-specific - confirm
+	callq	dlopen@PLT  # 1st argument (rdi) is the caller's, passed through unchanged
+	movq	%rax, %rbx  # keep the returned handle
+	testq	%rax, %rax
+	jne	.LBB77_2  # non-null handle: skip the failure path
+# %bb.1:                                # %if.then
+	callq	dlerror@PLT  # failure path: dlerror() is called but its return value is not used in this code
+.LBB77_2:                               # %if.end
+	movq	%rbx, %rax  # return value = dlopen result (null when loading failed)
+	addq	$8, %rsp  # discard the alignment slot pushed above
+	popq	%rbx
+	popq	%rbp
+	retq
+.Lfunc_end77:
+	.size	halide_default_load_library, .Lfunc_end77-halide_default_load_library
+                                        # -- End function
+	.section	.text.halide_default_get_library_symbol,"ax",@progbits  # function gets its own .text.<name> section
+	.weak	halide_default_get_library_symbol # -- Begin function halide_default_get_library_symbol (weak symbol): pure pass-through to dlsym with the caller's arguments
+	.p2align	4, 0x90  # align entry to 16 bytes, padding with 0x90 (x86 NOP)
+	.type	halide_default_get_library_symbol,@function
+halide_default_get_library_symbol:      # @halide_default_get_library_symbol
+# %bb.0:                                # %entry
+	pushq	%rbp  # frame is set up and immediately torn down; no argument register is modified
+	movq	%rsp, %rbp
+	popq	%rbp
+	jmp	dlsym@PLT                       # TAILCALL: dlsym sees the caller's arguments unchanged and returns directly to the caller
+.Lfunc_end78:
+	.size	halide_default_get_library_symbol, .Lfunc_end78-halide_default_get_library_symbol
+                                        # -- End function
+	.section	.text.halide_set_custom_get_symbol,"ax",@progbits  # function gets its own .text.<name> section
+	.weak	halide_set_custom_get_symbol    # -- Begin function halide_set_custom_get_symbol (weak symbol): stores the 1st argument into Halide::Runtime::Internal::custom_get_symbol and returns the value that was there before
+	.p2align	4, 0x90  # align entry to 16 bytes, padding with 0x90 (x86 NOP)
+	.type	halide_set_custom_get_symbol,@function
+halide_set_custom_get_symbol:           # @halide_set_custom_get_symbol
+# %bb.0:                                # %entry
+	pushq	%rbp  # standard frame-pointer prologue
+	movq	%rsp, %rbp
+	movq	_ZN6Halide7Runtime8Internal17custom_get_symbolE@GOTPCREL(%rip), %rcx  # rcx = address of the custom_get_symbol variable, read from the GOT
+	movq	(%rcx), %rax  # return value = previously installed pointer
+	movq	%rdi, (%rcx)  # install the new pointer; load and store are two plain moves, not an atomic exchange
+	popq	%rbp
+	retq
+.Lfunc_end79:
+	.size	halide_set_custom_get_symbol, .Lfunc_end79-halide_set_custom_get_symbol
+                                        # -- End function
+	.section	.text.halide_set_custom_load_library,"ax",@progbits  # function gets its own .text.<name> section
+	.weak	halide_set_custom_load_library  # -- Begin function halide_set_custom_load_library (weak symbol): stores the 1st argument into Halide::Runtime::Internal::custom_load_library and returns the value that was there before
+	.p2align	4, 0x90  # align entry to 16 bytes, padding with 0x90 (x86 NOP)
+	.type	halide_set_custom_load_library,@function
+halide_set_custom_load_library:         # @halide_set_custom_load_library
+# %bb.0:                                # %entry
+	pushq	%rbp  # standard frame-pointer prologue
+	movq	%rsp, %rbp
+	movq	_ZN6Halide7Runtime8Internal19custom_load_libraryE@GOTPCREL(%rip), %rcx  # rcx = address of the custom_load_library variable, read from the GOT
+	movq	(%rcx), %rax  # return value = previously installed pointer
+	movq	%rdi, (%rcx)  # install the new pointer; load and store are two plain moves, not an atomic exchange
+	popq	%rbp
+	retq
+.Lfunc_end80:
+	.size	halide_set_custom_load_library, .Lfunc_end80-halide_set_custom_load_library
+                                        # -- End function
+	.section	.text.halide_set_custom_get_library_symbol,"ax",@progbits  # function gets its own .text.<name> section
+	.weak	halide_set_custom_get_library_symbol # -- Begin function halide_set_custom_get_library_symbol (weak symbol): stores the 1st argument into Halide::Runtime::Internal::custom_get_library_symbol and returns the value that was there before
+	.p2align	4, 0x90  # align entry to 16 bytes, padding with 0x90 (x86 NOP)
+	.type	halide_set_custom_get_library_symbol,@function
+halide_set_custom_get_library_symbol:   # @halide_set_custom_get_library_symbol
+# %bb.0:                                # %entry
+	pushq	%rbp  # standard frame-pointer prologue
+	movq	%rsp, %rbp
+	movq	_ZN6Halide7Runtime8Internal25custom_get_library_symbolE@GOTPCREL(%rip), %rcx  # rcx = address of the custom_get_library_symbol variable, read from the GOT
+	movq	(%rcx), %rax  # return value = previously installed pointer
+	movq	%rdi, (%rcx)  # install the new pointer; load and store are two plain moves, not an atomic exchange
+	popq	%rbp
+	retq
+.Lfunc_end81:
+	.size	halide_set_custom_get_library_symbol, .Lfunc_end81-halide_set_custom_get_library_symbol
+                                        # -- End function
+	.section	.text.halide_get_symbol,"ax",@progbits  # function gets its own .text.<name> section
+	.weak	halide_get_symbol               # -- Begin function halide_get_symbol (weak symbol): forwards the call to the function pointer stored in Halide::Runtime::Internal::custom_get_symbol
+	.p2align	4, 0x90  # align entry to 16 bytes, padding with 0x90 (x86 NOP)
+	.type	halide_get_symbol,@function
+halide_get_symbol:                      # @halide_get_symbol
+# %bb.0:                                # %entry
+	pushq	%rbp  # standard frame-pointer prologue
+	movq	%rsp, %rbp
+	movq	_ZN6Halide7Runtime8Internal17custom_get_symbolE@GOTPCREL(%rip), %rax  # rax = address of the custom_get_symbol variable, read from the GOT
+	popq	%rbp  # tear the frame down before leaving via jump
+	jmpq	*(%rax)                         # TAILCALL: memory-indirect jump to the pointer stored in the variable; argument registers are untouched
+.Lfunc_end82:
+	.size	halide_get_symbol, .Lfunc_end82-halide_get_symbol
+                                        # -- End function
+	.section	.text.halide_load_library,"ax",@progbits  # function gets its own .text.<name> section
+	.weak	halide_load_library             # -- Begin function halide_load_library (weak symbol): forwards the call to the function pointer stored in Halide::Runtime::Internal::custom_load_library
+	.p2align	4, 0x90  # align entry to 16 bytes, padding with 0x90 (x86 NOP)
+	.type	halide_load_library,@function
+halide_load_library:                    # @halide_load_library
+# %bb.0:                                # %entry
+	pushq	%rbp  # standard frame-pointer prologue
+	movq	%rsp, %rbp
+	movq	_ZN6Halide7Runtime8Internal19custom_load_libraryE@GOTPCREL(%rip), %rax  # rax = address of the custom_load_library variable, read from the GOT
+	popq	%rbp  # tear the frame down before leaving via jump
+	jmpq	*(%rax)                         # TAILCALL: memory-indirect jump to the pointer stored in the variable; argument registers are untouched
+.Lfunc_end83:
+	.size	halide_load_library, .Lfunc_end83-halide_load_library
+                                        # -- End function
+	.section	.text.halide_get_library_symbol,"ax",@progbits  # function gets its own .text.<name> section
+	.weak	halide_get_library_symbol       # -- Begin function halide_get_library_symbol (weak symbol): forwards the call to the function pointer stored in Halide::Runtime::Internal::custom_get_library_symbol
+	.p2align	4, 0x90  # align entry to 16 bytes, padding with 0x90 (x86 NOP)
+	.type	halide_get_library_symbol,@function
+halide_get_library_symbol:              # @halide_get_library_symbol
+# %bb.0:                                # %entry
+	pushq	%rbp  # standard frame-pointer prologue
+	movq	%rsp, %rbp
+	movq	_ZN6Halide7Runtime8Internal25custom_get_library_symbolE@GOTPCREL(%rip), %rax  # rax = address of the custom_get_library_symbol variable, read from the GOT
+	movq	(%rax), %rax  # rax = function pointer currently stored in that variable
+	popq	%rbp  # tear the frame down before leaving via jump
+	jmpq	*%rax                           # TAILCALL: argument registers are untouched, so the target receives the caller's arguments
+.Lfunc_end84:
+	.size	halide_get_library_symbol, .Lfunc_end84-halide_get_library_symbol
+                                        # -- End function
+	.section	.text.halide_set_gpu_device,"ax",@progbits  # function gets its own .text.<name> section
+	.weak	halide_set_gpu_device           # -- Begin function halide_set_gpu_device (weak symbol): writes the 32-bit 1st argument to Halide::Runtime::Internal::halide_gpu_device and sets halide_gpu_device_initialized to 1
+	.p2align	4, 0x90  # align entry to 16 bytes, padding with 0x90 (x86 NOP)
+	.type	halide_set_gpu_device,@function
+halide_set_gpu_device:                  # @halide_set_gpu_device
+# %bb.0:                                # %entry
+	pushq	%rbp  # standard frame-pointer prologue
+	movq	%rsp, %rbp
+	movq	_ZN6Halide7Runtime8Internal17halide_gpu_deviceE@GOTPCREL(%rip), %rax  # rax = address of halide_gpu_device, read from the GOT
+	movl	%edi, (%rax)  # halide_gpu_device = 1st argument (32-bit store)
+	movq	_ZN6Halide7Runtime8Internal29halide_gpu_device_initializedE@GOTPCREL(%rip), %rax  # rax = address of the one-byte initialized flag
+	movb	$1, (%rax)  # mark the device id as initialized; both stores are plain moves and no lock is taken in this function
+	popq	%rbp
+	retq  # nothing is loaded into eax/rax as a result here
+.Lfunc_end85:
+	.size	halide_set_gpu_device, .Lfunc_end85-halide_set_gpu_device
+                                        # -- End function
+	.section	.text.halide_get_gpu_device,"ax",@progbits  # function gets its own .text.<name> section
+	.weak	halide_get_gpu_device           # -- Begin function halide_get_gpu_device (weak symbol): under a byte spin lock, returns halide_gpu_device; on first use initializes it from an environment variable (atoi of its value, or -1 if unset)
+	.p2align	4, 0x90  # align entry to 16 bytes, padding with 0x90 (x86 NOP)
+	.type	halide_get_gpu_device,@function
+halide_get_gpu_device:                  # @halide_get_gpu_device
+# %bb.0:                                # %entry
+	pushq	%rbp  # standard frame-pointer prologue
+	movq	%rsp, %rbp
+	pushq	%r14  # callee-saved; will hold the address of the initialized flag
+	pushq	%rbx  # callee-saved; will hold the address of the lock byte
+	movq	_ZN6Halide7Runtime8Internal22halide_gpu_device_lockE@GOTPCREL(%rip), %rbx  # rbx = address of halide_gpu_device_lock, read from the GOT
+	.p2align	4, 0x90  # align the spin-loop head to 16 bytes
+.LBB86_1:                               # %while.cond.i
+                                        # =>This Inner Loop Header: Depth=1
+	movb	$1, %al
+	xchgb	%al, (%rbx)  # atomically swap 1 into the lock byte (xchg with a memory operand is implicitly locked); al = previous value
+	testb	%al, %al
+	jne	.LBB86_1  # previous value non-zero: lock was already held, spin and retry
+# %bb.2:                                # %_ZN6Halide7Runtime8Internal14ScopedSpinLockC2EPVc.exit
+	movq	_ZN6Halide7Runtime8Internal29halide_gpu_device_initializedE@GOTPCREL(%rip), %r14  # r14 = address of the one-byte initialized flag
+	cmpb	$0, (%r14)
+	je	.LBB86_4  # flag is 0: take the first-use initialization path
+# %bb.3:                                # %_ZN6Halide7Runtime8Internal14ScopedSpinLockC2EPVc.exit.if.end4_crit_edge
+	movq	_ZN6Halide7Runtime8Internal17halide_gpu_deviceE@GOTPCREL(%rip), %rax
+	movl	(%rax), %eax  # already initialized: return the stored device id
+	jmp	.LBB86_8
+.LBB86_4:                               # %if.then
+	leaq	.L.str.8(%rip), %rdi  # getenv argument: string constant .L.str.8 (its contents are not visible in this part of the file)
+	callq	getenv@PLT
+	testq	%rax, %rax
+	je	.LBB86_5  # getenv returned null: variable is not set
+# %bb.6:                                # %if.then2
+	movq	%rax, %rdi
+	callq	atoi@PLT  # parse the variable's value as an int
+	jmp	.LBB86_7
+.LBB86_5:
+	movl	$-1, %eax  # variable not set: use -1
+.LBB86_7:                               # %if.end
+	movq	_ZN6Halide7Runtime8Internal17halide_gpu_deviceE@GOTPCREL(%rip), %rcx
+	movl	%eax, (%rcx)  # store the chosen device id; eax still holds it as the return value
+	movb	$1, (%r14)  # set the initialized flag so later calls take the stored-value path
+.LBB86_8:                               # %if.end4
+	movb	$0, (%rbx)  # release the spin lock by storing 0 to the lock byte
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	retq  # result is in eax
+.Lfunc_end86:
+	.size	halide_get_gpu_device, .Lfunc_end86-halide_get_gpu_device
+                                        # -- End function
+	.section	.text.halide_default_trace,"ax",@progbits
+	.weak	halide_default_trace            # -- Begin function halide_default_trace
+	.p2align	4, 0x90
+	.type	halide_default_trace,@function
+halide_default_trace:                   # @halide_default_trace
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	subq	$72, %rsp
+	movq	%rsi, %r14
+	movl	$1, %r12d
+	movl	$1, %ebx
+	lock		xaddl	%ebx, _ZZ20halide_default_traceE3ids(%rip)
+	movq	%rdi, -72(%rbp)                 # 8-byte Spill
+	callq	halide_get_trace_file@PLT
+	movl	%eax, -64(%rbp)                 # 4-byte Spill
+	testl	%eax, %eax
+	movq	%r14, -56(%rbp)                 # 8-byte Spill
+	movl	%ebx, -76(%rbp)                 # 4-byte Spill
+	jle	.LBB87_40
+# %bb.1:                                # %if.then
+	movzwl	34(%r14), %eax
+	movzbl	33(%r14), %ebx
+	addq	$7, %rbx
+	shrq	$3, %rbx
+	imulq	%rax, %rbx
+	movl	48(%r14), %r15d
+	shll	$2, %r15d
+	movq	(%r14), %rdi
+	callq	strlen@PLT
+	movq	%rax, %r13
+	incl	%r13d
+	movq	24(%r14), %rdi
+	testq	%rdi, %rdi
+	je	.LBB87_3
+# %bb.2:                                # %cond.true
+	callq	strlen@PLT
+	movq	%rax, %r12
+	incl	%r12d
+.LBB87_3:                               # %cond.end
+	movq	%rbx, -96(%rbp)                 # 8-byte Spill
+	movq	%r15, -88(%rbp)                 # 8-byte Spill
+	leal	(%r15,%rbx), %eax
+	addl	%r13d, %eax
+	leal	(%r12,%rax), %r15d
+	addl	$31, %r15d
+	andl	$-4, %r15d
+	movq	_ZN6Halide7Runtime8Internal19halide_trace_bufferE@GOTPCREL(%rip), %rax
+	movq	(%rax), %rbx
+	leaq	12(%rbx), %rax
+	movq	%rax, -48(%rbp)                 # 8-byte Spill
+	cmpl	$1048577, %r15d                 # imm = 0x100001
+	movq	%r12, -112(%rbp)                # 8-byte Spill
+	movq	%r13, -104(%rbp)                # 8-byte Spill
+	jae	.LBB87_4
+# %bb.12:                               # %while.body.i.i.us.i.preheader
+	movl	$1073741823, %r13d              # imm = 0x3FFFFFFF
+	movl	$-2147483648, %r14d             # imm = 0x80000000
+	jmp	.LBB87_13
+.LBB87_20:                              # %do.end.critedge.i.us.i
+                                        #   in Loop: Header=BB87_13 Depth=1
+	lock		andl	$2147483647, (%rbx)     # imm = 0x7FFFFFFF
+	.p2align	4, 0x90
+.LBB87_13:                              # %while.body.i.i.us.i
+                                        # =>This Loop Header: Depth=1
+                                        #     Child Loop BB87_16 Depth 2
+	movl	(%rbx), %eax
+	andl	%r13d, %eax
+	leal	1(%rax), %ecx
+                                        # kill: def $eax killed $eax killed $rax
+	lock		cmpxchgl	%ecx, (%rbx)
+	jne	.LBB87_13
+# %bb.14:                               # %_ZN6Halide7Runtime8Internal23SharedExclusiveSpinLock14acquire_sharedEv.exit.i.us.i
+                                        #   in Loop: Header=BB87_13 Depth=1
+	movl	%r15d, %eax
+	lock		xaddl	%eax, 4(%rbx)
+	leal	(%rax,%r15), %ecx
+	cmpl	$1048577, %ecx                  # imm = 0x100001
+	jb	.LBB87_22
+# %bb.15:                               # %while.body.us.i
+                                        #   in Loop: Header=BB87_13 Depth=1
+	lock		addl	%r15d, 8(%rbx)
+	lock		decl	(%rbx)
+	.p2align	4, 0x90
+.LBB87_16:                              # %while.body.i.i3.us.i
+                                        #   Parent Loop BB87_13 Depth=1
+                                        # =>  This Inner Loop Header: Depth=2
+	lock		orl	$1073741824, (%rbx)     # imm = 0x40000000
+	movl	$1073741824, %eax               # imm = 0x40000000
+	lock		cmpxchgl	%r14d, (%rbx)
+	jne	.LBB87_16
+# %bb.17:                               # %_ZN6Halide7Runtime8Internal23SharedExclusiveSpinLock17acquire_exclusiveEv.exit.i.us.i
+                                        #   in Loop: Header=BB87_13 Depth=1
+	movl	4(%rbx), %r12d
+	testl	%r12d, %r12d
+	je	.LBB87_20
+# %bb.18:                               # %if.then.i8.us.i
+                                        #   in Loop: Header=BB87_13 Depth=1
+	subl	8(%rbx), %r12d
+	movl	%r12d, 4(%rbx)
+	movl	-64(%rbp), %edi                 # 4-byte Reload
+	movq	-48(%rbp), %rsi                 # 8-byte Reload
+	movq	%r12, %rdx
+	callq	write@PLT
+	movq	$0, 4(%rbx)
+	lock		andl	$2147483647, (%rbx)     # imm = 0x7FFFFFFF
+	cmpl	%eax, %r12d
+	je	.LBB87_13
+# %bb.19:                               # %if.then10.i.us.i
+                                        #   in Loop: Header=BB87_13 Depth=1
+	movq	-72(%rbp), %rdi                 # 8-byte Reload
+	leaq	.L.str.32(%rip), %rsi
+	callq	halide_print@PLT
+	callq	abort@PLT
+	jmp	.LBB87_13
+.LBB87_40:                              # %if.else
+	movl	$4096, %edi                     # imm = 0x1000
+	callq	malloc@PLT
+	movq	%rax, %r12
+	testq	%rax, %rax
+	je	.LBB87_41
+# %bb.42:                               # %if.then6.i430
+	leaq	4095(%r12), %r13
+	movb	$0, 4095(%r12)
+	jmp	.LBB87_43
+.LBB87_4:
+	movl	$1073741823, %r14d              # imm = 0x3FFFFFFF
+	movl	$-2147483648, %r13d             # imm = 0x80000000
+	jmp	.LBB87_5
+.LBB87_21:                              # %do.end.critedge.i.i
+                                        #   in Loop: Header=BB87_5 Depth=1
+	lock		andl	$2147483647, (%rbx)     # imm = 0x7FFFFFFF
+	.p2align	4, 0x90
+.LBB87_5:                               # %while.body.i.i.i
+                                        # =>This Loop Header: Depth=1
+                                        #     Child Loop BB87_8 Depth 2
+	movl	(%rbx), %eax
+	andl	%r14d, %eax
+	leal	1(%rax), %ecx
+                                        # kill: def $eax killed $eax killed $rax
+	lock		cmpxchgl	%ecx, (%rbx)
+	jne	.LBB87_5
+# %bb.6:                                # %_ZN6Halide7Runtime8Internal23SharedExclusiveSpinLock14acquire_sharedEv.exit.i.i
+                                        #   in Loop: Header=BB87_5 Depth=1
+	movq	-72(%rbp), %rdi                 # 8-byte Reload
+	leaq	.L.str.31(%rip), %rsi
+	callq	halide_print@PLT
+	callq	abort@PLT
+	movl	%r15d, %eax
+	lock		xaddl	%eax, 4(%rbx)
+	leal	(%rax,%r15), %ecx
+	cmpl	$1048577, %ecx                  # imm = 0x100001
+	jb	.LBB87_22
+# %bb.7:                                # %while.body.i
+                                        #   in Loop: Header=BB87_5 Depth=1
+	lock		addl	%r15d, 8(%rbx)
+	lock		decl	(%rbx)
+	.p2align	4, 0x90
+.LBB87_8:                               # %while.body.i.i3.i
+                                        #   Parent Loop BB87_5 Depth=1
+                                        # =>  This Inner Loop Header: Depth=2
+	lock		orl	$1073741824, (%rbx)     # imm = 0x40000000
+	movl	$1073741824, %eax               # imm = 0x40000000
+	lock		cmpxchgl	%r13d, (%rbx)
+	jne	.LBB87_8
+# %bb.9:                                # %_ZN6Halide7Runtime8Internal23SharedExclusiveSpinLock17acquire_exclusiveEv.exit.i.i
+                                        #   in Loop: Header=BB87_5 Depth=1
+	movl	4(%rbx), %r12d
+	testl	%r12d, %r12d
+	je	.LBB87_21
+# %bb.10:                               # %if.then.i8.i
+                                        #   in Loop: Header=BB87_5 Depth=1
+	subl	8(%rbx), %r12d
+	movl	%r12d, 4(%rbx)
+	movl	-64(%rbp), %edi                 # 4-byte Reload
+	movq	-48(%rbp), %rsi                 # 8-byte Reload
+	movq	%r12, %rdx
+	callq	write@PLT
+	movq	$0, 4(%rbx)
+	lock		andl	$2147483647, (%rbx)     # imm = 0x7FFFFFFF
+	cmpl	%eax, %r12d
+	je	.LBB87_5
+# %bb.11:                               # %if.then10.i.i
+                                        #   in Loop: Header=BB87_5 Depth=1
+	movq	-72(%rbp), %rdi                 # 8-byte Reload
+	leaq	.L.str.32(%rip), %rsi
+	callq	halide_print@PLT
+	callq	abort@PLT
+	jmp	.LBB87_5
+.LBB87_22:                              # %_ZN6Halide7Runtime8Internal11TraceBuffer14acquire_packetEPvij.exit
+	movl	%eax, %eax
+	movq	-48(%rbp), %r12                 # 8-byte Reload
+	addq	%rax, %r12
+	cmpl	$4097, %r15d                    # imm = 0x1001
+	movq	-112(%rbp), %r13                # 8-byte Reload
+	jb	.LBB87_27
+# %bb.23:                               # %if.then18
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT
+	movq	%rax, %r14
+	testq	%rax, %rax
+	je	.LBB87_24
+# %bb.25:                               # %if.else.i
+	movq	%r12, %rbx
+	leaq	1023(%r14), %r12
+	movb	$0, 1023(%r14)
+	movl	%r15d, %edx
+	movq	%r14, %rdi
+	movq	%r12, %rsi
+	movl	$1, %ecx
+	callq	halide_uint64_to_string@PLT
+	leaq	.L.str.7.166(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movq	%rbx, %r12
+	callq	halide_string_to_string@PLT
+	subq	%r14, %rax
+	leaq	1(%rax), %rdx
+	xorl	%edi, %edi
+	movq	%r14, %rsi
+	callq	halide_msan_annotate_memory_is_initialized@PLT
+	xorl	%edi, %edi
+	movq	%r14, %rsi
+	callq	halide_print@PLT
+	jmp	.LBB87_26
+.LBB87_41:
+	xorl	%r13d, %r13d
+.LBB87_43:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE2ELy4096EEC2EPvPc.exit
+	movzbl	33(%r14), %eax
+	movl	$8, %ecx
+	.p2align	4, 0x90
+.LBB87_44:                              # %while.cond
+                                        # =>This Inner Loop Header: Depth=1
+	movl	%ecx, %r14d
+	leal	(%r14,%r14), %ecx
+	cmpl	%eax, %r14d
+	jl	.LBB87_44
+# %bb.45:                               # %do.body
+	cmpl	$65, %r14d
+	jl	.LBB87_47
+# %bb.46:                               # %if.then64
+	leaq	.L.str.2.11(%rip), %rsi
+	movq	-72(%rbp), %rdi                 # 8-byte Reload
+	callq	halide_print@PLT
+	callq	abort@PLT
+.LBB87_47:                              # %do.end
+	movq	-56(%rbp), %r15                 # 8-byte Reload
+	movl	36(%r15), %ecx
+	leaq	.Lreltable.halide_default_trace(%rip), %rax
+	movq	%rcx, -48(%rbp)                 # 8-byte Spill
+	movslq	(%rax,%rcx,4), %rdx
+	addq	%rax, %rdx
+	movq	%r12, %rdi
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.20.179(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	movq	(%r15), %rdx
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.30.142(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	movslq	44(%r15), %rdx
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	leaq	.L.str.22.181(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	cmpw	$2, 34(%r15)
+	movq	%r12, -64(%rbp)                 # 8-byte Spill
+	jb	.LBB87_49
+# %bb.48:                               # %if.then81
+	leaq	.L.str.17(%rip), %rdx
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+.LBB87_49:                              # %if.end83
+	movq	-56(%rbp), %r12                 # 8-byte Reload
+	cmpl	$0, 48(%r12)
+	jle	.LBB87_52
+# %bb.50:                               # %if.end101.peel
+	movq	16(%r12), %rax
+	movslq	(%rax), %rdx
+	movq	%r13, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	movq	%rax, %rdi
+	cmpl	$2, 48(%r12)
+	jl	.LBB87_51
+# %bb.56:                               # %if.then87.preheader
+	movl	$1, %ebx
+	leaq	.L.str.55(%rip), %r15
+	movq	-56(%rbp), %r12                 # 8-byte Reload
+	jmp	.LBB87_57
+	.p2align	4, 0x90
+.LBB87_59:                              # %if.else98.split
+                                        #   in Loop: Header=BB87_57 Depth=1
+	movq	%r15, %rdx
+.LBB87_60:                              # %if.end101
+                                        #   in Loop: Header=BB87_57 Depth=1
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	movq	-56(%rbp), %r12                 # 8-byte Reload
+	movq	16(%r12), %rcx
+	movslq	(%rcx,%rbx,4), %rdx
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	movq	%rax, %rdi
+	incq	%rbx
+	movslq	48(%r12), %rax
+	cmpq	%rax, %rbx
+	jge	.LBB87_52
+.LBB87_57:                              # %if.then87
+                                        # =>This Inner Loop Header: Depth=1
+	movzwl	34(%r12), %ecx
+	cmpl	$2, %ecx
+	jb	.LBB87_59
+# %bb.58:                               # %land.lhs.true
+                                        #   in Loop: Header=BB87_57 Depth=1
+	movl	%ebx, %eax
+	xorl	%edx, %edx
+	divl	%ecx
+	movl	%edx, %eax
+	leaq	.L.str.18(%rip), %rdx
+	testl	%eax, %eax
+	jne	.LBB87_59
+	jmp	.LBB87_60
+.LBB87_24:                              # %if.then.i419
+	movl	%r15d, %edx
+	xorl	%edi, %edi
+	xorl	%esi, %esi
+	movl	$1, %ecx
+	callq	halide_uint64_to_string@PLT
+	leaq	.L.str.7.166(%rip), %rdx
+	movq	%rax, %rdi
+	xorl	%esi, %esi
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.29.165(%rip), %rsi
+	xorl	%edi, %edi
+	callq	halide_error@PLT
+.LBB87_26:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE0ELy1024EED2Ev.exit
+	movq	%r14, %rdi
+	callq	free@PLT
+.LBB87_27:                              # %if.end
+	movl	%r15d, (%r12)
+	movl	-76(%rbp), %r15d                # 4-byte Reload
+	movl	%r15d, 4(%r12)
+	movq	-56(%rbp), %rbx                 # 8-byte Reload
+	movl	32(%rbx), %eax
+	movl	%eax, 8(%r12)
+	vmovups	36(%rbx), %xmm0
+	vmovups	%xmm0, 12(%r12)
+	movq	16(%rbx), %rsi
+	testq	%rsi, %rsi
+	je	.LBB87_29
+# %bb.28:                               # %if.then29
+	leaq	28(%r12), %rdi
+	movl	-88(%rbp), %edx                 # 4-byte Reload
+	callq	memcpy@PLT
+.LBB87_29:                              # %if.end34
+	movq	8(%rbx), %rsi
+	testq	%rsi, %rsi
+	je	.LBB87_31
+# %bb.30:                               # %if.then36
+	movslq	24(%r12), %rax
+	leaq	(%r12,%rax,4), %rdi
+	addq	$28, %rdi
+	movq	-96(%rbp), %rdx                 # 8-byte Reload
+	callq	memcpy@PLT
+.LBB87_31:                              # %if.end41
+	movslq	24(%r12), %rax
+	leaq	(%r12,%rax,4), %rax
+	addq	$28, %rax
+	movzwl	10(%r12), %ecx
+	movzbl	9(%r12), %edi
+	addq	$7, %rdi
+	shrq	$3, %rdi
+	imulq	%rcx, %rdi
+	addq	%rax, %rdi
+	movq	(%rbx), %rsi
+	movl	-104(%rbp), %edx                # 4-byte Reload
+	callq	memcpy@PLT
+	movslq	24(%r12), %rax
+	leaq	(%r12,%rax,4), %rax
+	addq	$28, %rax
+	movzwl	10(%r12), %ecx
+	movzbl	9(%r12), %edi
+	addq	$7, %rdi
+	shrq	$3, %rdi
+	imulq	%rcx, %rdi
+	addq	%rax, %rdi
+	.p2align	4, 0x90
+.LBB87_32:                              # %while.cond.i402
+                                        # =>This Inner Loop Header: Depth=1
+	cmpb	$0, (%rdi)
+	leaq	1(%rdi), %rdi
+	jne	.LBB87_32
+# %bb.33:                               # %_ZN21halide_trace_packet_t9trace_tagEv.exit
+	movq	24(%rbx), %rax
+	testq	%rax, %rax
+	leaq	.L.str.1.10(%rip), %rsi
+	cmovneq	%rax, %rsi
+	movl	%r13d, %edx
+	callq	memcpy@PLT
+	movq	_ZN6Halide7Runtime8Internal19halide_trace_bufferE@GOTPCREL(%rip), %rcx
+	movq	(%rcx), %rax
+	mfence
+	lock		decl	(%rax)
+	cmpl	$9, 36(%rbx)
+	jne	.LBB87_132
+# %bb.34:                               # %if.then58
+	movq	(%rcx), %rbx
+	movl	$-2147483648, %ecx              # imm = 0x80000000
+	.p2align	4, 0x90
+.LBB87_35:                              # %while.body.i.i
+                                        # =>This Inner Loop Header: Depth=1
+	lock		orl	$1073741824, (%rbx)     # imm = 0x40000000
+	movl	$1073741824, %eax               # imm = 0x40000000
+	lock		cmpxchgl	%ecx, (%rbx)
+	jne	.LBB87_35
+# %bb.36:                               # %_ZN6Halide7Runtime8Internal23SharedExclusiveSpinLock17acquire_exclusiveEv.exit.i
+	movl	4(%rbx), %r14d
+	testl	%r14d, %r14d
+	je	.LBB87_39
+# %bb.37:                               # %if.then.i
+	subl	8(%rbx), %r14d
+	movl	%r14d, 4(%rbx)
+	leaq	12(%rbx), %rsi
+	movl	-64(%rbp), %edi                 # 4-byte Reload
+	movq	%r14, %rdx
+	callq	write@PLT
+	movq	$0, 4(%rbx)
+	lock		andl	$2147483647, (%rbx)     # imm = 0x7FFFFFFF
+	cmpl	%eax, %r14d
+	je	.LBB87_132
+# %bb.38:                               # %if.then10.i
+	leaq	.L.str.32(%rip), %rsi
+	movq	-72(%rbp), %rdi                 # 8-byte Reload
+	callq	halide_print@PLT
+	callq	abort@PLT
+	jmp	.LBB87_132
+.LBB87_51:
+	movq	-56(%rbp), %r12                 # 8-byte Reload
+.LBB87_52:                              # %for.cond.cleanup
+	cmpw	$2, 34(%r12)
+	leaq	.L.str.20(%rip), %rax
+	leaq	.L.str.8.120(%rip), %rdx
+	cmovaeq	%rax, %rdx
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rbx
+	cmpl	$2, -48(%rbp)                   # 4-byte Folded Reload
+	jge	.LBB87_95
+# %bb.53:                               # %if.then116
+	cmpw	$2, 34(%r12)
+	leaq	.L.str.22(%rip), %rax
+	leaq	.L.str.23(%rip), %rdx
+	cmovaeq	%rax, %rdx
+	movq	%rbx, %rdi
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rbx
+	cmpw	$0, 34(%r12)
+	je	.LBB87_95
+# %bb.54:                               # %if.end137.peel
+	leaq	8(%r12), %rax
+	movq	%rax, -48(%rbp)                 # 8-byte Spill
+	movzbl	32(%r12), %eax
+	cmpq	$3, %rax
+	ja	.LBB87_88
+# %bb.55:                               # %if.end137.peel
+	leaq	.LJTI87_0(%rip), %rcx
+	movslq	(%rcx,%rax,4), %rax
+	addq	%rcx, %rax
+	jmpq	*%rax
+.LBB87_79:                              # %if.then141.peel
+	cmpl	$8, %r14d
+	je	.LBB87_84
+# %bb.80:                               # %if.then141.peel
+	cmpl	$16, %r14d
+	je	.LBB87_83
+# %bb.81:                               # %if.then141.peel
+	cmpl	$32, %r14d
+	jne	.LBB87_85
+# %bb.82:                               # %if.then159.peel
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	(%rax), %rax
+	movslq	(%rax), %rdx
+	jmp	.LBB87_86
+.LBB87_39:                              # %do.end.critedge.i
+	lock		andl	$2147483647, (%rbx)     # imm = 0x7FFFFFFF
+	jmp	.LBB87_132
+.LBB87_71:                              # %if.then177.peel
+	cmpl	$8, %r14d
+	je	.LBB87_77
+# %bb.72:                               # %if.then177.peel
+	cmpl	$16, %r14d
+	je	.LBB87_76
+# %bb.73:                               # %if.then177.peel
+	cmpl	$32, %r14d
+	jne	.LBB87_78
+# %bb.74:                               # %if.then195.peel
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	(%rax), %rax
+	movl	(%rax), %edx
+	jmp	.LBB87_75
+.LBB87_62:                              # %do.body214.peel
+	cmpl	$15, %r14d
+	jg	.LBB87_64
+# %bb.63:                               # %if.then216.peel
+	leaq	.L.str.24(%rip), %rsi
+	movq	-72(%rbp), %rdi                 # 8-byte Reload
+	callq	halide_print@PLT
+	callq	abort@PLT
+.LBB87_64:                              # %do.end219.peel
+	cmpl	$32, %r14d
+	je	.LBB87_67
+# %bb.65:                               # %do.end219.peel
+	cmpl	$16, %r14d
+	jne	.LBB87_68
+# %bb.66:                               # %if.then228.peel
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	(%rax), %rax
+	movzwl	(%rax), %edi
+	callq	halide_float16_bits_to_double@PLT
+	jmp	.LBB87_69
+.LBB87_61:                              # %if.then245.peel
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	(%rax), %rax
+	movq	(%rax), %rdx
+	movq	%rbx, %rdi
+	movq	%r13, %rsi
+	callq	halide_pointer_to_string@PLT
+	jmp	.LBB87_87
+.LBB87_67:                              # %if.then221.peel
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	(%rax), %rax
+	vmovss	(%rax), %xmm0                   # xmm0 = mem[0],zero,zero,zero
+	vcvtss2sd	%xmm0, %xmm0, %xmm0
+	movq	%rbx, %rdi
+	movq	%r13, %rsi
+	xorl	%edx, %edx
+	callq	halide_double_to_string@PLT
+	jmp	.LBB87_87
+.LBB87_68:                              # %if.else233.peel
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	(%rax), %rax
+	vmovsd	(%rax), %xmm0                   # xmm0 = mem[0],zero
+.LBB87_69:                              # %for.inc254.peel
+	movq	%rbx, %rdi
+	movq	%r13, %rsi
+	movl	$1, %edx
+	callq	halide_double_to_string@PLT
+	jmp	.LBB87_87
+.LBB87_84:                              # %if.then143.peel
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	(%rax), %rax
+	movsbq	(%rax), %rdx
+	jmp	.LBB87_86
+.LBB87_83:                              # %if.then151.peel
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	(%rax), %rax
+	movswq	(%rax), %rdx
+	jmp	.LBB87_86
+.LBB87_85:                              # %if.else164.peel
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	(%rax), %rax
+	movq	(%rax), %rdx
+	jmp	.LBB87_86
+.LBB87_77:                              # %if.then179.peel
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	(%rax), %rax
+	movzbl	(%rax), %edx
+	jmp	.LBB87_86
+.LBB87_76:                              # %if.then187.peel
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	(%rax), %rax
+	movzwl	(%rax), %edx
+.LBB87_86:                              # %for.inc254.peel
+	movq	%rbx, %rdi
+	movq	%r13, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	jmp	.LBB87_87
+.LBB87_78:                              # %if.else200.peel
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	(%rax), %rax
+	movq	(%rax), %rdx
+.LBB87_75:                              # %for.inc254.peel
+	movq	%rbx, %rdi
+	movq	%r13, %rsi
+	movl	$1, %ecx
+	callq	halide_uint64_to_string@PLT
+.LBB87_87:                              # %for.inc254.peel
+	movq	%rax, %rbx
+.LBB87_88:                              # %for.inc254.peel
+	movq	-56(%rbp), %rax                 # 8-byte Reload
+	cmpw	$2, 34(%rax)
+	jb	.LBB87_94
+# %bb.89:                               # %if.end137.preheader
+	movl	$1, %r12d
+	movq	-56(%rbp), %r15                 # 8-byte Reload
+	jmp	.LBB87_90
+.LBB87_114:                             # %if.then187
+                                        #   in Loop: Header=BB87_90 Depth=1
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	(%rax), %rax
+	movzwl	(%rax,%r12,2), %edx
+	.p2align	4, 0x90
+.LBB87_106:                             # %for.inc254
+                                        #   in Loop: Header=BB87_90 Depth=1
+	movq	%rbx, %rdi
+	movq	%r13, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+.LBB87_128:                             # %for.inc254
+                                        #   in Loop: Header=BB87_90 Depth=1
+	movq	%rax, %rbx
+.LBB87_129:                             # %for.inc254
+                                        #   in Loop: Header=BB87_90 Depth=1
+	incq	%r12
+	movq	-56(%rbp), %r15                 # 8-byte Reload
+	movzwl	34(%r15), %eax
+	cmpq	%rax, %r12
+	jae	.LBB87_92
+.LBB87_90:                              # %if.end137
+                                        # =>This Inner Loop Header: Depth=1
+	movq	%rbx, %rdi
+	movq	%r13, %rsi
+	leaq	.L.str.55(%rip), %rdx
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rbx
+	movzbl	32(%r15), %eax
+	cmpq	$3, %rax
+	ja	.LBB87_129
+# %bb.91:                               # %if.end137
+                                        #   in Loop: Header=BB87_90 Depth=1
+	leaq	.LJTI87_1(%rip), %rcx
+	movslq	(%rcx,%rax,4), %rax
+	addq	%rcx, %rax
+	jmpq	*%rax
+.LBB87_102:                             # %if.then141
+                                        #   in Loop: Header=BB87_90 Depth=1
+	cmpl	$32, %r14d
+	je	.LBB87_108
+# %bb.103:                              # %if.then141
+                                        #   in Loop: Header=BB87_90 Depth=1
+	cmpl	$16, %r14d
+	je	.LBB87_107
+# %bb.104:                              # %if.then141
+                                        #   in Loop: Header=BB87_90 Depth=1
+	cmpl	$8, %r14d
+	jne	.LBB87_109
+# %bb.105:                              # %if.then143
+                                        #   in Loop: Header=BB87_90 Depth=1
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	(%rax), %rax
+	movsbq	(%rax,%r12), %rdx
+	jmp	.LBB87_106
+	.p2align	4, 0x90
+.LBB87_110:                             # %if.then177
+                                        #   in Loop: Header=BB87_90 Depth=1
+	cmpl	$32, %r14d
+	je	.LBB87_115
+# %bb.111:                              # %if.then177
+                                        #   in Loop: Header=BB87_90 Depth=1
+	cmpl	$16, %r14d
+	je	.LBB87_114
+# %bb.112:                              # %if.then177
+                                        #   in Loop: Header=BB87_90 Depth=1
+	cmpl	$8, %r14d
+	jne	.LBB87_117
+# %bb.113:                              # %if.then179
+                                        #   in Loop: Header=BB87_90 Depth=1
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	(%rax), %rax
+	movzbl	(%rax,%r12), %edx
+	jmp	.LBB87_106
+	.p2align	4, 0x90
+.LBB87_118:                             # %do.body214
+                                        #   in Loop: Header=BB87_90 Depth=1
+	cmpl	$15, %r14d
+	jg	.LBB87_120
+# %bb.119:                              # %if.then216
+                                        #   in Loop: Header=BB87_90 Depth=1
+	movq	-72(%rbp), %rdi                 # 8-byte Reload
+	leaq	.L.str.24(%rip), %rsi
+	callq	halide_print@PLT
+	callq	abort@PLT
+.LBB87_120:                             # %do.end219
+                                        #   in Loop: Header=BB87_90 Depth=1
+	cmpl	$16, %r14d
+	je	.LBB87_123
+# %bb.121:                              # %do.end219
+                                        #   in Loop: Header=BB87_90 Depth=1
+	cmpl	$32, %r14d
+	jne	.LBB87_124
+# %bb.122:                              # %if.then221
+                                        #   in Loop: Header=BB87_90 Depth=1
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	(%rax), %rax
+	vmovss	(%rax,%r12,4), %xmm0            # xmm0 = mem[0],zero,zero,zero
+	vcvtss2sd	%xmm0, %xmm0, %xmm0
+	movq	%rbx, %rdi
+	movq	%r13, %rsi
+	xorl	%edx, %edx
+	callq	halide_double_to_string@PLT
+	jmp	.LBB87_128
+	.p2align	4, 0x90
+.LBB87_127:                             # %if.then245
+                                        #   in Loop: Header=BB87_90 Depth=1
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	(%rax), %rax
+	movq	(%rax,%r12,8), %rdx
+	movq	%rbx, %rdi
+	movq	%r13, %rsi
+	callq	halide_pointer_to_string@PLT
+	jmp	.LBB87_128
+.LBB87_123:                             # %if.then228
+                                        #   in Loop: Header=BB87_90 Depth=1
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	(%rax), %rax
+	movzwl	(%rax,%r12,2), %edi
+	callq	halide_float16_bits_to_double@PLT
+	jmp	.LBB87_125
+.LBB87_124:                             # %if.else233
+                                        #   in Loop: Header=BB87_90 Depth=1
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	(%rax), %rax
+	vmovsd	(%rax,%r12,8), %xmm0            # xmm0 = mem[0],zero
+.LBB87_125:                             # %for.inc254
+                                        #   in Loop: Header=BB87_90 Depth=1
+	movq	%rbx, %rdi
+	movq	%r13, %rsi
+	movl	$1, %edx
+	callq	halide_double_to_string@PLT
+	jmp	.LBB87_128
+.LBB87_108:                             # %if.then159
+                                        #   in Loop: Header=BB87_90 Depth=1
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	(%rax), %rax
+	movslq	(%rax,%r12,4), %rdx
+	jmp	.LBB87_106
+.LBB87_107:                             # %if.then151
+                                        #   in Loop: Header=BB87_90 Depth=1
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	(%rax), %rax
+	movswq	(%rax,%r12,2), %rdx
+	jmp	.LBB87_106
+.LBB87_115:                             # %if.then195
+                                        #   in Loop: Header=BB87_90 Depth=1
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	(%rax), %rax
+	movl	(%rax,%r12,4), %edx
+	jmp	.LBB87_116
+.LBB87_109:                             # %if.else164
+                                        #   in Loop: Header=BB87_90 Depth=1
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	(%rax), %rax
+	movq	(%rax,%r12,8), %rdx
+	jmp	.LBB87_106
+.LBB87_117:                             # %if.else200
+                                        #   in Loop: Header=BB87_90 Depth=1
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	(%rax), %rax
+	movq	(%rax,%r12,8), %rdx
+.LBB87_116:                             # %for.inc254
+                                        #   in Loop: Header=BB87_90 Depth=1
+	movq	%rbx, %rdi
+	movq	%r13, %rsi
+	movl	$1, %ecx
+	callq	halide_uint64_to_string@PLT
+	jmp	.LBB87_128
+.LBB87_92:                              # %for.cond.cleanup132
+	cmpw	$1, %ax
+	jbe	.LBB87_94
+# %bb.93:                               # %if.then261
+	leaq	.L.str.25(%rip), %rdx
+	movq	%rbx, %rdi
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rbx
+.LBB87_94:                              # %if.end264
+	movq	-56(%rbp), %r12                 # 8-byte Reload
+.LBB87_95:                              # %if.end264
+	movq	24(%r12), %rax
+	testq	%rax, %rax
+	je	.LBB87_98
+# %bb.96:                               # %land.lhs.true267
+	cmpb	$0, (%rax)
+	je	.LBB87_98
+# %bb.97:                               # %if.then270
+	leaq	.L.str.26(%rip), %rdx
+	movq	%rbx, %rdi
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	movq	24(%r12), %rdx
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.27(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rbx
+.LBB87_98:                              # %if.end275
+	movq	-64(%rbp), %r15                 # 8-byte Reload
+	leaq	.L.str.7.166(%rip), %rdx
+	movq	%rbx, %rdi
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rbx
+	movq	_ZN6Halide7Runtime8Internal22halide_trace_file_lockE@GOTPCREL(%rip), %r14
+	.p2align	4, 0x90
+.LBB87_99:                              # %while.cond.i406
+                                        # =>This Inner Loop Header: Depth=1
+	movb	$1, %al
+	xchgb	%al, (%r14)
+	testb	%al, %al
+	jne	.LBB87_99
+# %bb.100:                              # %_ZN6Halide7Runtime8Internal14ScopedSpinLockC2EPVc.exit
+	testq	%r15, %r15
+	je	.LBB87_101
+# %bb.130:                              # %if.else.i555
+	subq	%r15, %rbx
+	incq	%rbx
+	movq	-72(%rbp), %r12                 # 8-byte Reload
+	movq	%r12, %rdi
+	movq	-64(%rbp), %rsi                 # 8-byte Reload
+	movq	%rbx, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT
+	movq	%r12, %rdi
+	movq	-64(%rbp), %rsi                 # 8-byte Reload
+	callq	halide_print@PLT
+	movb	$0, (%r14)
+	movq	%r12, %rdi
+	movq	-64(%rbp), %r14                 # 8-byte Reload
+	movq	%r14, %rsi
+	movq	%rbx, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT
+	jmp	.LBB87_131
+.LBB87_101:                             # %if.then.i548
+	leaq	.L.str.29.165(%rip), %rbx
+	movq	-72(%rbp), %r12                 # 8-byte Reload
+	movq	%r12, %rdi
+	movq	%rbx, %rsi
+	callq	halide_print@PLT
+	movb	$0, (%r14)
+	movq	%r12, %rdi
+	movq	-64(%rbp), %r14                 # 8-byte Reload
+	movq	%rbx, %rsi
+	callq	halide_error@PLT
+.LBB87_131:                             # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE2ELy4096EED2Ev.exit
+	movl	-76(%rbp), %r15d                # 4-byte Reload
+	movq	%r14, %rdi
+	callq	free@PLT
+.LBB87_132:                             # %if.end278
+	movl	%r15d, %eax
+	addq	$72, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end87:
+	.size	halide_default_trace, .Lfunc_end87-halide_default_trace
+	.section	.rodata.halide_default_trace,"a",@progbits
+	.p2align	2, 0x0
+.LJTI87_0:
+	.long	.LBB87_79-.LJTI87_0
+	.long	.LBB87_71-.LJTI87_0
+	.long	.LBB87_62-.LJTI87_0
+	.long	.LBB87_61-.LJTI87_0
+.LJTI87_1:
+	.long	.LBB87_102-.LJTI87_1
+	.long	.LBB87_110-.LJTI87_1
+	.long	.LBB87_118-.LJTI87_1
+	.long	.LBB87_127-.LJTI87_1
+                                        # -- End function
+	.section	.text.halide_get_trace_file,"ax",@progbits
+	.weak	halide_get_trace_file           # -- Begin function halide_get_trace_file
+	.p2align	4, 0x90
+	.type	halide_get_trace_file,@function
+halide_get_trace_file:                  # @halide_get_trace_file -- returns halide_trace_file in eax, setting it up first when it is negative; the whole body runs with the halide_trace_file_lock byte held
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r12
+	pushq	%rbx
+	movq	%rdi, %rbx	# keep the first argument in a callee-saved register; it is only used for halide_print below
+	movq	_ZN6Halide7Runtime8Internal22halide_trace_file_lockE@GOTPCREL(%rip), %r15	# r15 = address of the lock byte
+	.p2align	4, 0x90
+.LBB88_1:                               # %while.cond.i
+                                        # =>This Inner Loop Header: Depth=1
+	movb	$1, %al
+	xchgb	%al, (%r15)	# atomically store 1 in the lock byte and fetch its previous value
+	testb	%al, %al
+	jne	.LBB88_1	# previous value nonzero: lock already taken, keep spinning
+# %bb.2:                                # %_ZN6Halide7Runtime8Internal14ScopedSpinLockC2EPVc.exit
+	movq	_ZN6Halide7Runtime8Internal17halide_trace_fileE@GOTPCREL(%rip), %r12	# r12 = address of halide_trace_file
+	cmpl	$0, (%r12)
+	js	.LBB88_3	# negative value: take the setup path
+.LBB88_9:                               # %if.end11
+	movl	(%r12), %eax	# return value = current halide_trace_file
+	movb	$0, (%r15)	# release the lock byte
+	popq	%rbx
+	popq	%r12
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.LBB88_3:                               # %if.then
+	leaq	.L.str.28(%rip), %rdi
+	callq	getenv@PLT	# look up the environment variable named by .L.str.28
+	testq	%rax, %rax
+	je	.LBB88_8	# variable not set
+# %bb.4:                                # %if.then1
+	leaq	.L.str.29(%rip), %rsi
+	movq	%rax, %rdi
+	callq	fopen64@PLT	# open the path from the environment, mode string .L.str.29
+	movq	%rax, %r14	# r14 = opened stream
+	testq	%rax, %rax
+	jne	.LBB88_6
+# %bb.5:                                # %if.then4
+	leaq	.L.str.30(%rip), %rsi
+	movq	%rbx, %rdi
+	callq	halide_print@PLT	# open failed: print .L.str.30 ...
+	callq	abort@PLT	# ... then abort
+.LBB88_6:                               # %do.end
+	movq	%r14, %rdi
+	callq	fileno@PLT
+	movl	%eax, %edi
+	callq	halide_set_trace_file@PLT	# halide_trace_file = descriptor of the opened stream
+	movq	_ZN6Halide7Runtime8Internal35halide_trace_file_internally_openedE@GOTPCREL(%rip), %rax
+	movq	%r14, (%rax)	# record the stream in halide_trace_file_internally_opened
+	movq	_ZN6Halide7Runtime8Internal19halide_trace_bufferE@GOTPCREL(%rip), %rbx
+	cmpq	$0, (%rbx)
+	jne	.LBB88_9	# halide_trace_buffer already set: nothing to allocate
+# %bb.7:                                # %if.then7
+	movl	$1048588, %edi                  # imm = 0x10000C
+	callq	malloc@PLT
+	movq	%rax, (%rbx)	# publish the new buffer in halide_trace_buffer
+	movq	$0, 4(%rax)	# zero the 8 bytes at offset 4; NOTE(review): malloc result is dereferenced without a null check
+	xorl	%ecx, %ecx
+	xchgl	%ecx, (%rax)	# atomically store 0 in the first 4 bytes of the buffer
+	jmp	.LBB88_9
+.LBB88_8:                               # %if.else
+	xorl	%edi, %edi
+	callq	halide_set_trace_file@PLT	# no environment override: halide_trace_file = 0
+	jmp	.LBB88_9
+.Lfunc_end88:
+	.size	halide_get_trace_file, .Lfunc_end88-halide_get_trace_file
+                                        # -- End function
+	.section	.text.halide_set_trace_file,"ax",@progbits
+	.weak	halide_set_trace_file           # -- Begin function halide_set_trace_file
+	.p2align	4, 0x90
+	.type	halide_set_trace_file,@function
+halide_set_trace_file:                  # @halide_set_trace_file -- stores its 32-bit argument (edi) in halide_trace_file; takes no lock itself
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	_ZN6Halide7Runtime8Internal17halide_trace_fileE@GOTPCREL(%rip), %rax	# rax = address of halide_trace_file
+	movl	%edi, (%rax)	# plain 32-bit store of the new value
+	popq	%rbp
+	retq
+.Lfunc_end89:
+	.size	halide_set_trace_file, .Lfunc_end89-halide_set_trace_file
+                                        # -- End function
+	.section	.text.halide_trace_cleanup,"ax",@progbits
+	.weak	halide_trace_cleanup            # -- Begin function halide_trace_cleanup
+	.p2align	4, 0x90
+	.type	halide_trace_cleanup,@function
+halide_trace_cleanup:                   # @halide_trace_cleanup -- thin forwarder: does nothing but hand control to halide_shutdown_trace
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	popq	%rbp	# frame is torn down again before the jump so the callee returns straight to our caller
+	jmp	halide_shutdown_trace@PLT       # TAILCALL
+.Lfunc_end90:
+	.size	halide_trace_cleanup, .Lfunc_end90-halide_trace_cleanup
+                                        # -- End function
+	.section	.text.halide_shutdown_trace,"ax",@progbits
+	.weak	halide_shutdown_trace           # -- Begin function halide_shutdown_trace
+	.p2align	4, 0x90
+	.type	halide_shutdown_trace,@function
+halide_shutdown_trace:                  # @halide_shutdown_trace -- closes the stream kept in halide_trace_file_internally_opened, resets the trace globals, frees halide_trace_buffer; eax = 0, or -30 when fclose reports failure
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r14
+	pushq	%rbx
+	movq	_ZN6Halide7Runtime8Internal35halide_trace_file_internally_openedE@GOTPCREL(%rip), %r14
+	movq	(%r14), %rdi	# rdi = stream recorded by halide_get_trace_file, if any
+	testq	%rdi, %rdi
+	je	.LBB91_4	# nothing was opened internally: return 0 without touching any state
+# %bb.1:                                # %if.then
+	callq	fclose@PLT
+	movl	%eax, %ebx	# keep the fclose result for the return-code decision below
+	movq	_ZN6Halide7Runtime8Internal17halide_trace_fileE@GOTPCREL(%rip), %rax
+	movl	$0, (%rax)	# halide_trace_file = 0
+	movq	_ZN6Halide7Runtime8Internal29halide_trace_file_initializedE@GOTPCREL(%rip), %rax
+	movb	$0, (%rax)	# halide_trace_file_initialized = 0
+	movq	$0, (%r14)	# halide_trace_file_internally_opened = null
+	movq	_ZN6Halide7Runtime8Internal19halide_trace_bufferE@GOTPCREL(%rip), %rax
+	movq	(%rax), %rdi
+	testq	%rdi, %rdi
+	je	.LBB91_3	# no buffer allocated
+# %bb.2:                                # %if.then2
+	callq	free@PLT	# NOTE(review): halide_trace_buffer is not written back to null after this free -- confirm the stale pointer cannot be reused
+.LBB91_3:                               # %if.end
+	movl	$-30, %eax	# provisional error code
+	testl	%ebx, %ebx
+	je	.LBB91_4	# fclose returned 0: fall into the success return
+# %bb.5:                                # %return
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	retq
+.LBB91_4:                               # %if.end5
+	xorl	%eax, %eax	# success: return 0
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	retq
+.Lfunc_end91:
+	.size	halide_shutdown_trace, .Lfunc_end91-halide_shutdown_trace
+                                        # -- End function
+	.section	.text.halide_set_custom_trace,"ax",@progbits
+	.weak	halide_set_custom_trace         # -- Begin function halide_set_custom_trace
+	.p2align	4, 0x90
+	.type	halide_set_custom_trace,@function
+halide_set_custom_trace:                # @halide_set_custom_trace -- installs the pointer in rdi as halide_custom_trace and returns the previous one in rax
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	_ZN6Halide7Runtime8Internal19halide_custom_traceE@GOTPCREL(%rip), %rcx	# rcx = address of halide_custom_trace
+	movq	(%rcx), %rax	# rax = previously installed pointer (the return value)
+	movq	%rdi, (%rcx)	# install the new one; load and store are two plain moves, not one atomic exchange
+	popq	%rbp
+	retq
+.Lfunc_end92:
+	.size	halide_set_custom_trace, .Lfunc_end92-halide_set_custom_trace
+                                        # -- End function
+	.section	.text.halide_trace,"ax",@progbits
+	.weak	halide_trace                    # -- Begin function halide_trace
+	.p2align	4, 0x90
+	.type	halide_trace,@function
+halide_trace:                           # @halide_trace -- dispatches to whatever function pointer is currently stored in halide_custom_trace
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	_ZN6Halide7Runtime8Internal19halide_custom_traceE@GOTPCREL(%rip), %rax
+	movq	(%rax), %rax	# rax = installed handler; it is not tested for null before the jump
+	popq	%rbp	# argument registers are left untouched, so the handler sees the caller's arguments
+	jmpq	*%rax                           # TAILCALL
+.Lfunc_end93:
+	.size	halide_trace, .Lfunc_end93-halide_trace
+                                        # -- End function
+	.section	.text.halide_trace_helper,"ax",@progbits
+	.weak	halide_trace_helper             # -- Begin function halide_trace_helper
+	.p2align	4, 0x90
+	.type	halide_trace_helper,@function
+halide_trace_helper:                    # @halide_trace_helper -- packs its register and stack arguments into a 56-byte record at -96(%rbp), marks the record and two caller buffers as initialized for msan, then returns halide_trace(rdi, record)
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	subq	$56, %rsp	# room for the record
+	movl	%r9d, %r13d	# r13d = 6th argument; later divided by 8 to size the rdx buffer
+	movq	%rcx, %r14	# r14 = 4th argument (pointer annotated last)
+	movq	%rdx, %r12	# r12 = 3rd argument (pointer annotated second)
+	movq	%rdi, %rbx	# rbx = 1st argument, forwarded to every call below
+	movslq	48(%rbp), %r15	# r15 = sign-extended stack argument at 48(%rbp), an element count for the r14 buffer
+	movl	40(%rbp), %eax
+	movl	32(%rbp), %ecx
+	movl	24(%rbp), %edx
+	movl	16(%rbp), %r9d
+	movq	56(%rbp), %rdi
+	movq	%rsi, -96(%rbp)	# record +0: 2nd argument
+	movq	%r12, -88(%rbp)	# record +8: 3rd argument
+	movq	%r14, -80(%rbp)	# record +16: 4th argument
+	movq	%rdi, -72(%rbp)	# record +24: stack argument at 56(%rbp)
+	movb	%r8b, -64(%rbp)	# record +32: low byte of 5th argument
+	movb	%r13b, -63(%rbp)	# record +33: low byte of 6th argument
+	movw	%r9w, -62(%rbp)	# record +34: low 16 bits of stack argument at 16(%rbp)
+	movl	%edx, -60(%rbp)	# record +36: stack argument at 24(%rbp)
+	movl	%ecx, -56(%rbp)	# record +40: stack argument at 32(%rbp)
+	movl	%eax, -52(%rbp)	# record +44: stack argument at 40(%rbp)
+	movl	%r15d, -48(%rbp)	# record +48: stack argument at 48(%rbp)
+	leaq	-96(%rbp), %rsi
+	movl	$56, %edx
+	movq	%rbx, %rdi
+	callq	halide_msan_annotate_memory_is_initialized@PLT	# annotate the whole 56-byte record
+	leal	7(%r13), %eax	# eax = 6th argument + 7
+	addl	$14, %r13d	# r13d = eax + 7, the bias used when eax is negative
+	testl	%eax, %eax
+	cmovnsl	%eax, %r13d	# non-negative: use eax unbiased
+	sarl	$3, %r13d	# together: signed (6th argument + 7) / 8, rounded toward zero
+	imull	16(%rbp), %r13d	# times the full 32-bit stack argument at 16(%rbp)
+	movslq	%r13d, %rdx
+	movq	%rbx, %rdi
+	movq	%r12, %rsi
+	callq	halide_msan_annotate_memory_is_initialized@PLT	# annotate that many bytes at the 3rd-argument pointer
+	shlq	$2, %r15	# 4 bytes per element
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	movq	%r15, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT	# annotate the 4th-argument pointer
+	movq	%rbx, %rdi
+	leaq	-96(%rbp), %rsi
+	callq	halide_trace@PLT	# eax from this call is left in place and becomes our return value
+	addq	$56, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end94:
+	.size	halide_trace_helper, .Lfunc_end94-halide_trace_helper
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal9ends_withEPKcS3_,"ax",@progbits
+	.weak	_ZN6Halide7Runtime8Internal9ends_withEPKcS3_ # -- Begin function _ZN6Halide7Runtime8Internal9ends_withEPKcS3_
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal9ends_withEPKcS3_,@function
+_ZN6Halide7Runtime8Internal9ends_withEPKcS3_: # @_ZN6Halide7Runtime8Internal9ends_withEPKcS3_ -- rdi and rsi point at NUL-terminated byte strings; al = 1 when a backward byte-by-byte walk from both terminators finds no mismatch. NOTE(review): also yields 1 when the rdi string is empty, or is shorter than the rsi string but equal to its tail
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	xorl	%eax, %eax	# rax = scan index into the rdi string
+	.p2align	4, 0x90
+.LBB95_1:                               # %while.cond
+                                        # =>This Inner Loop Header: Depth=1
+	cmpb	$0, (%rdi,%rax)
+	leaq	1(%rax), %rax	# lea advances the index without disturbing the flags from cmpb
+	jne	.LBB95_1	# on exit rax = length of the rdi string + 1
+# %bb.2:                                # %while.cond1.preheader
+	movq	$-1, %rcx
+	.p2align	4, 0x90
+.LBB95_3:                               # %while.cond1
+                                        # =>This Inner Loop Header: Depth=1
+	cmpb	$0, 1(%rsi,%rcx)
+	leaq	1(%rcx), %rcx
+	jne	.LBB95_3	# on exit rcx = length of the rsi string
+# %bb.4:                                # %while.cond6.preheader
+	xorl	%edx, %edx
+	cmpq	$1, %rax
+	je	.LBB95_11	# rdi string is empty
+# %bb.5:                                # %while.cond6.preheader
+	movl	$0, %r8d
+	testq	%rcx, %rcx
+	je	.LBB95_10	# rsi string is empty: the compare below sees 0 and 0, result 1
+	.p2align	4, 0x90
+.LBB95_6:                               # %while.body8
+                                        # =>This Inner Loop Header: Depth=1
+	movzbl	-1(%rdi,%rax), %edx	# current byte of the rdi string (first iteration: its terminator)
+	cmpb	(%rsi,%rcx), %dl	# against the current byte of the rsi string
+	jne	.LBB95_12	# mismatch: return 0
+# %bb.7:                                # %if.end
+                                        #   in Loop: Header=BB95_6 Depth=1
+	leaq	-1(%rcx), %rdx	# step both cursors one byte toward the start
+	leaq	-1(%rax), %r8
+	cmpq	$1, %rcx
+	je	.LBB95_9	# rsi cursor has reached its first byte
+# %bb.8:                                # %if.end
+                                        #   in Loop: Header=BB95_6 Depth=1
+	movq	%rdx, %rcx
+	cmpq	$2, %rax
+	movq	%r8, %rax
+	jne	.LBB95_6	# keep going unless the rdi cursor has reached its first byte
+.LBB95_9:                               # %while.end13.loopexit
+	movzbl	-1(%rdi,%r8), %r8d	# last pair: bytes at the two final cursor positions
+	movzbl	(%rsi,%rdx), %edx
+.LBB95_10:                              # %while.end13
+	cmpb	%dl, %r8b
+	sete	%al	# result = whether the last pair matches
+                                        # kill: def $al killed $al killed $eax
+	popq	%rbp
+	retq
+.LBB95_11:
+	xorl	%r8d, %r8d
+	cmpb	%dl, %r8b	# both operands are 0 here, so al becomes 1 whatever the rsi string holds
+	sete	%al
+                                        # kill: def $al killed $al killed $eax
+	popq	%rbp
+	retq
+.LBB95_12:
+	xorl	%eax, %eax	# mismatch: return 0
+                                        # kill: def $al killed $al killed $eax
+	popq	%rbp
+	retq
+.Lfunc_end95:
+	.size	_ZN6Halide7Runtime8Internal9ends_withEPKcS3_, .Lfunc_end95-_ZN6Halide7Runtime8Internal9ends_withEPKcS3_
+                                        # -- End function
+	.section	.rodata.cst32,"aM",@progbits,32
+	.p2align	5, 0x0                          # -- Begin function halide_debug_to_file
+.LCPI96_0:
+	.long	327962                          # 0x5011a
+	.long	1                               # 0x1
+	.long	194                             # 0xc2
+	.long	327963                          # 0x5011b
+	.long	1                               # 0x1
+	.long	202                             # 0xca
+	.long	196892                          # 0x3011c
+	.long	1                               # 0x1
+	.section	.rodata.cst16,"aM",@progbits,16
+	.p2align	4, 0x0
+.LCPI96_1:
+	.long	0                               # 0x0
+	.long	1                               # 0x1
+	.long	1                               # 0x1
+	.long	1                               # 0x1
+	.section	.text.halide_debug_to_file,"ax",@progbits
+	.weak	halide_debug_to_file
+	.p2align	4, 0x90
+	.type	halide_debug_to_file,@function
+halide_debug_to_file:                   # @halide_debug_to_file
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	subq	$4440, %rsp                     # imm = 0x1158
+	movq	%rcx, %r14
+	movl	%edx, %r12d
+	movq	%rsi, %rbx
+	movq	%rdi, %r13
+	cmpq	$0, 16(%rcx)
+	jne	.LBB96_3
+# %bb.1:                                # %entry
+	cmpq	$0, (%r14)
+	jne	.LBB96_3
+# %bb.2:                                # %if.then
+	leaq	.L.str.34(%rip), %rsi
+	movq	%r13, %rdi
+	callq	halide_error@PLT
+	movl	$-34, %r15d
+	jmp	.LBB96_87
+.LBB96_3:                               # %if.end
+	cmpl	$5, 36(%r14)
+	jl	.LBB96_5
+# %bb.4:                                # %if.then1
+	leaq	.L.str.1.35(%rip), %rsi
+	movq	%r13, %rdi
+	callq	halide_error@PLT
+	movl	$-43, %r15d
+	jmp	.LBB96_87
+.LBB96_5:                               # %if.end2
+	movq	%r13, %rdi
+	movq	%r14, %rsi
+	callq	halide_copy_to_host@PLT
+	movl	%eax, %r15d
+	testl	%eax, %eax
+	jne	.LBB96_87
+# %bb.6:                                # %cleanup.cont
+	leaq	.L.str.2.36(%rip), %rsi
+	movq	%rbx, %rdi
+	callq	fopen64@PLT
+	movq	%rax, -56(%rbp)                 # 8-byte Spill
+	testq	%rax, %rax
+	je	.LBB96_11
+# %bb.7:                                # %if.end9
+	vxorps	%xmm0, %xmm0, %xmm0
+	vmovups	%ymm0, -192(%rbp)
+	vmovups	%ymm0, -160(%rbp)
+	movslq	36(%r14), %rax
+	testq	%rax, %rax
+	jle	.LBB96_12
+# %bb.8:                                # %for.body.lr.ph
+	movq	40(%r14), %rcx
+	leal	-1(%rax), %esi
+	cmpl	$3, %esi
+	movl	$3, %edx
+	cmovbl	%esi, %edx
+	shlq	$4, %rdx
+	addq	$16, %rdx
+	movl	$1, %r15d
+	leaq	-188(%rbp), %rsi
+	xorl	%edi, %edi
+	.p2align	4, 0x90
+.LBB96_9:                               # %for.body
+                                        # =>This Inner Loop Header: Depth=1
+	vmovups	(%rcx,%rdi), %xmm0
+	vmovups	%xmm0, -4(%rsi,%rdi)
+	imull	-188(%rbp,%rdi), %r15d
+	addq	$16, %rdi
+	cmpq	%rdi, %rdx
+	jne	.LBB96_9
+# %bb.10:                               # %for.cond20.preheader
+	cmpl	$3, %eax
+	jle	.LBB96_13
+	jmp	.LBB96_15
+.LBB96_11:
+	movl	$-13, %r15d
+	jmp	.LBB96_87
+.LBB96_12:
+	movl	$1, %r15d
+.LBB96_13:                              # %for.body23.preheader
+	leaq	1(%rax), %rcx
+	shlq	$4, %rax
+	leaq	-192(%rbp), %rdx
+	addq	%rdx, %rax
+	addq	$8, %rax
+	movabsq	$4294967296, %rdx               # imm = 0x100000000
+	movq	%rcx, %rsi
+	.p2align	4, 0x90
+.LBB96_14:                              # %for.body23
+                                        # =>This Inner Loop Header: Depth=1
+	movq	%rdx, -8(%rax)
+	movl	$0, (%rax)
+	incq	%rsi
+	addq	$16, %rax
+	cmpl	$4, %ecx
+	movq	%rsi, %rcx
+	jne	.LBB96_14
+.LBB96_15:                              # %for.cond.cleanup22
+	movzbl	33(%r14), %eax
+	addq	$7, %rax
+	shrq	$3, %rax
+	movq	%rax, -72(%rbp)                 # 8-byte Spill
+	leaq	.L.str.3.37(%rip), %rsi
+	movq	%rbx, %rdi
+	vzeroupper
+	callq	_ZN6Halide7Runtime8Internal9ends_withEPKcS3_@PLT
+	testb	%al, %al
+	jne	.LBB96_17
+# %bb.16:                               # %lor.lhs.false
+	leaq	.L.str.4.38(%rip), %rsi
+	movq	%rbx, %rdi
+	callq	_ZN6Halide7Runtime8Internal9ends_withEPKcS3_@PLT
+	testb	%al, %al
+	je	.LBB96_27
+.LBB96_17:                              # %if.then38
+	movl	-188(%rbp), %edx
+	movl	-172(%rbp), %esi
+	movl	-140(%rbp), %r8d
+	cmpl	$2, %r8d
+	setb	%al
+	movl	-156(%rbp), %edi
+	cmpl	$5, %edi
+	setl	%cl
+	testb	%cl, %al
+	movl	$1, %ebx
+	cmovel	%edi, %ebx
+	movl	%r8d, -60(%rbp)                 # 4-byte Spill
+	movl	%r8d, %r13d
+	movl	%edi, -64(%rbp)                 # 4-byte Spill
+	cmovnel	%edi, %r13d
+	movabsq	$34362509641, %rax              # imm = 0x8002A4949
+	movq	%rax, -4480(%rbp)
+	movl	$16777231, -4472(%rbp)          # imm = 0x100000F
+	movw	$4, -4468(%rbp)
+	movl	$1, -4466(%rbp)
+	movl	%edx, -48(%rbp)                 # 4-byte Spill
+	movl	%edx, -4462(%rbp)
+	movabsq	$4295229697, %rax               # imm = 0x100040101
+	movq	%rax, -4458(%rbp)
+	movl	%esi, -4450(%rbp)
+	movq	-72(%rbp), %rdx                 # 8-byte Reload
+	leal	(,%rdx,8), %eax
+	movabsq	$4295164162, %rcx               # imm = 0x100030102
+	movq	%rcx, -4446(%rbp)
+	movw	%ax, -4438(%rbp)
+	movabsq	$4295164163, %rax               # imm = 0x100030103
+	movq	%rax, -4434(%rbp)
+	movw	$1, -4426(%rbp)
+	xorl	%eax, %eax
+	cmpl	$3, %r13d
+	setge	%al
+	incl	%eax
+	movabsq	$4295164166, %rcx               # imm = 0x100030106
+	movq	%rcx, -4422(%rbp)
+	movw	%ax, -4414(%rbp)
+	movl	$262417, -4410(%rbp)            # imm = 0x40111
+	movl	%r13d, -4406(%rbp)
+	movabsq	$845614636073170, %rax          # imm = 0x30115000000D2
+	movq	%rax, -4402(%rbp)
+	movl	$1, -4394(%rbp)
+	movw	%r13w, -4390(%rbp)
+	movabsq	$4295229718, %rax               # imm = 0x100040116
+	movq	%rax, -4386(%rbp)
+	imull	%edx, %r15d
+	cmpl	$1, %r13d
+	leal	210(,%r13,4), %eax
+	cmovel	%r15d, %eax
+	movl	%esi, -44(%rbp)                 # 4-byte Spill
+	movl	%esi, -4378(%rbp)
+	movl	$262423, -4374(%rbp)            # imm = 0x40117
+	movl	%r13d, -4370(%rbp)
+	movl	%eax, -4366(%rbp)
+	vmovaps	.LCPI96_0(%rip), %ymm0          # ymm0 = [327962,1,194,327963,1,202,196892,1]
+	vmovups	%ymm0, -4362(%rbp)
+	movw	$2, -4330(%rbp)
+	movabsq	$4295164200, %rax               # imm = 0x100030128
+	movq	%rax, -4326(%rbp)
+	movw	$1, -4318(%rbp)
+	movslq	%r12d, %rax
+	movq	_ZN6Halide7Runtime8Internal30pixel_type_to_tiff_sample_typeE@GOTPCREL(%rip), %rcx
+	movzwl	(%rcx,%rax,2), %eax
+	movabsq	$4295164243, %rcx               # imm = 0x100030153
+	movq	%rcx, -4314(%rbp)
+	movw	%ax, -4306(%rbp)
+	movabsq	$4295262437, %rax               # imm = 0x1000480E5
+	movq	%rax, -4302(%rbp)
+	movl	%ebx, -4294(%rbp)
+	vmovaps	.LCPI96_1(%rip), %xmm0          # xmm0 = [0,1,1,1]
+	vmovups	%xmm0, -4290(%rbp)
+	movl	$1, -4274(%rbp)
+	leaq	-4480(%rbp), %rdi
+	movl	$210, %esi
+	movl	$1, %edx
+	movq	-56(%rbp), %r12                 # 8-byte Reload
+	movq	%r12, %rcx
+	vzeroupper
+	callq	fwrite@PLT
+	testq	%rax, %rax
+	je	.LBB96_26
+# %bb.18:                               # %if.end105
+	cmpl	$2, %r13d
+	jl	.LBB96_25
+# %bb.19:                               # %_ZN6Halide7Runtime8Internal10ScopedFile5writeEPKvm.exit625.lr.ph
+	leal	210(,%r13,8), %eax
+	movl	%eax, -384(%rbp)
+	movl	-48(%rbp), %r15d                # 4-byte Reload
+	imull	-72(%rbp), %r15d                # 4-byte Folded Reload
+	imull	-44(%rbp), %r15d                # 4-byte Folded Reload
+	imull	%ebx, %r15d
+	leaq	-384(%rbp), %rbx
+	movl	%r13d, %r12d
+	.p2align	4, 0x90
+.LBB96_20:                              # %_ZN6Halide7Runtime8Internal10ScopedFile5writeEPKvm.exit625
+                                        # =>This Inner Loop Header: Depth=1
+	movl	$4, %esi
+	movl	$1, %edx
+	movq	%rbx, %rdi
+	movq	-56(%rbp), %rcx                 # 8-byte Reload
+	callq	fwrite@PLT
+	testq	%rax, %rax
+	je	.LBB96_57
+# %bb.21:                               # %if.end120
+                                        #   in Loop: Header=BB96_20 Depth=1
+	addl	%r15d, -384(%rbp)
+	decl	%r12d
+	jne	.LBB96_20
+# %bb.22:                               # %for.end133
+	movl	%r15d, -240(%rbp)
+	leaq	-240(%rbp), %rbx
+	movq	-56(%rbp), %r12                 # 8-byte Reload
+.LBB96_23:                              # %_ZN6Halide7Runtime8Internal10ScopedFile5writeEPKvm.exit631
+                                        # =>This Inner Loop Header: Depth=1
+	movl	$4, %esi
+	movl	$1, %edx
+	movq	%rbx, %rdi
+	movq	%r12, %rcx
+	callq	fwrite@PLT
+	testq	%rax, %rax
+	je	.LBB96_26
+# %bb.24:                               # %for.cond142
+                                        #   in Loop: Header=BB96_23 Depth=1
+	decl	%r13d
+	jne	.LBB96_23
+.LBB96_25:                              # %cleanup159
+	xorl	%ecx, %ecx
+	movl	-64(%rbp), %ebx                 # 4-byte Reload
+	movl	-60(%rbp), %r15d                # 4-byte Reload
+	jmp	.LBB96_36
+.LBB96_27:                              # %if.else169
+	leaq	.L.str.5.39(%rip), %rsi
+	movq	%rbx, %rdi
+	callq	_ZN6Halide7Runtime8Internal9ends_withEPKcS3_@PLT
+	testb	%al, %al
+	je	.LBB96_35
+# %bb.28:                               # %while.cond.preheader
+	xorl	%ecx, %ecx
+	movq	%rbx, %rdx
+	.p2align	4, 0x90
+.LBB96_29:                              # %while.cond
+                                        # =>This Inner Loop Header: Depth=1
+	decq	%rcx
+	cmpb	$0, (%rdx)
+	leaq	1(%rdx), %rdx
+	jne	.LBB96_29
+# %bb.30:                               # %while.body176.preheader
+	subq	%rcx, %rbx
+	incq	%rcx
+	xorl	%eax, %eax
+	.p2align	4, 0x90
+.LBB96_31:                              # %while.body176
+                                        # =>This Inner Loop Header: Depth=1
+	incq	%rcx
+	cmpb	$46, -2(%rdx,%rax)
+	leaq	-1(%rax), %rax
+	jne	.LBB96_31
+# %bb.32:                               # %while.cond179.preheader
+	leaq	-2(%rbx), %rsi
+	leaq	-1(%rcx), %rdx
+	xorl	%r15d, %r15d
+.LBB96_33:                              # %while.cond179
+                                        # =>This Inner Loop Header: Depth=1
+	cmpq	%r15, %rcx
+	je	.LBB96_58
+# %bb.34:                               # %land.rhs181
+                                        #   in Loop: Header=BB96_33 Depth=1
+	leaq	(%rsi,%r15), %rdi
+	decq	%r15
+	cmpb	$47, (%rax,%rdi)
+	jne	.LBB96_33
+	jmp	.LBB96_59
+.LBB96_35:                              # %_ZN6Halide7Runtime8Internal10ScopedFile5writeEPKvm.exit673
+	movl	-188(%rbp), %eax
+	movl	-172(%rbp), %ecx
+	movl	%eax, -48(%rbp)                 # 4-byte Spill
+	movl	%eax, -4480(%rbp)
+	movl	%ecx, -44(%rbp)                 # 4-byte Spill
+	movl	%ecx, -4476(%rbp)
+	movl	-156(%rbp), %ebx
+	movl	%ebx, -4472(%rbp)
+	movl	-140(%rbp), %r15d
+	movl	%r15d, -4468(%rbp)
+	movl	%r12d, -4464(%rbp)
+	leaq	-4480(%rbp), %rdi
+	movl	$20, %esi
+	movl	$1, %edx
+	movq	-56(%rbp), %r12                 # 8-byte Reload
+	movq	%r12, %rcx
+	callq	fwrite@PLT
+	xorl	%ecx, %ecx
+	testq	%rax, %rax
+	je	.LBB96_26
+.LBB96_36:                              # %if.end316
+	movl	$4096, %eax                     # imm = 0x1000
+	xorl	%edx, %edx
+	divl	-72(%rbp)                       # 4-byte Folded Reload
+	movl	%eax, -96(%rbp)                 # 4-byte Spill
+	testl	%r15d, %r15d
+	jle	.LBB96_54
+# %bb.37:                               # %for.body327.lr.ph
+	testl	%ebx, %ebx
+	jle	.LBB96_54
+# %bb.38:                               # %for.body327.lr.ph
+	cmpl	$0, -44(%rbp)                   # 4-byte Folded Reload
+	jle	.LBB96_54
+# %bb.39:                               # %for.body327.lr.ph
+	cmpl	$0, -48(%rbp)                   # 4-byte Folded Reload
+	jle	.LBB96_54
+# %bb.40:                               # %for.body327.us.us.us.preheader
+	movq	%rcx, -88(%rbp)                 # 8-byte Spill
+	movl	-144(%rbp), %eax
+	movl	%eax, -80(%rbp)                 # 4-byte Spill
+	addl	%eax, %r15d
+	movl	%r15d, -60(%rbp)                # 4-byte Spill
+	movl	-160(%rbp), %r12d
+	addl	%r12d, %ebx
+	movl	%ebx, -64(%rbp)                 # 4-byte Spill
+	movl	-192(%rbp), %eax
+	movl	-176(%rbp), %ebx
+	addl	%ebx, -44(%rbp)                 # 4-byte Folded Spill
+	movl	%eax, -108(%rbp)                # 4-byte Spill
+	addl	%eax, -48(%rbp)                 # 4-byte Folded Spill
+	movl	-96(%rbp), %eax                 # 4-byte Reload
+                                        # kill: def $eax killed $eax def $rax
+	imull	-72(%rbp), %eax                 # 4-byte Folded Reload
+	movq	%rax, -248(%rbp)                # 8-byte Spill
+	xorl	%r13d, %r13d
+	movl	%r12d, -100(%rbp)               # 4-byte Spill
+	movl	%ebx, -104(%rbp)                # 4-byte Spill
+	jmp	.LBB96_42
+.LBB96_41:                              # %for.inc394.us.us.us.us.us.us
+                                        #   in Loop: Header=BB96_42 Depth=1
+	incl	%ebx
+	cmpl	-44(%rbp), %ebx                 # 4-byte Folded Reload
+	jge	.LBB96_50
+.LBB96_42:                              # %for.body349.us.us.us.us.us.us
+                                        # =>This Loop Header: Depth=1
+                                        #     Child Loop BB96_44 Depth 2
+                                        #       Child Loop BB96_46 Depth 3
+	movl	-108(%rbp), %r15d               # 4-byte Reload
+	movl	%r13d, %eax
+	jmp	.LBB96_44
+	.p2align	4, 0x90
+.LBB96_43:                              # %for.inc389.us.us.us.us.us.us
+                                        #   in Loop: Header=BB96_44 Depth=2
+	incl	%r15d
+	movl	%r13d, %eax
+	cmpl	-48(%rbp), %r15d                # 4-byte Folded Reload
+	jge	.LBB96_41
+.LBB96_44:                              # %for.body360.us.us.us.us.us.us
+                                        #   Parent Loop BB96_42 Depth=1
+                                        # =>  This Loop Header: Depth=2
+                                        #       Child Loop BB96_46 Depth 3
+	leal	1(%rax), %r13d
+	movl	%r15d, -384(%rbp)
+	movl	%ebx, -380(%rbp)
+	movl	%r12d, -376(%rbp)
+	movl	-80(%rbp), %ecx                 # 4-byte Reload
+	movl	%ecx, -372(%rbp)
+	movl	36(%r14), %ecx
+	testl	%ecx, %ecx
+	jle	.LBB96_47
+# %bb.45:                               # %for.body.lr.ph.i.us.us.us.us.us.us
+                                        #   in Loop: Header=BB96_44 Depth=2
+	movq	40(%r14), %rsi
+	shlq	$2, %rcx
+	xorl	%edi, %edi
+	xorl	%edx, %edx
+	.p2align	4, 0x90
+.LBB96_46:                              # %for.body.i.us.us.us.us.us.us
+                                        #   Parent Loop BB96_42 Depth=1
+                                        #     Parent Loop BB96_44 Depth=2
+                                        # =>    This Inner Loop Header: Depth=3
+	movslq	8(%rsi,%rdi,4), %r8
+	movslq	-384(%rbp,%rdi), %r9
+	movslq	(%rsi,%rdi,4), %r10
+	subq	%r10, %r9
+	imulq	%r8, %r9
+	addq	%r9, %rdx
+	addq	$4, %rdi
+	cmpq	%rdi, %rcx
+	jne	.LBB96_46
+	jmp	.LBB96_48
+.LBB96_47:                              #   in Loop: Header=BB96_44 Depth=2
+	xorl	%edx, %edx
+.LBB96_48:                              # %_ZNK15halide_buffer_t10address_ofEPKi.exit.us.us.us.us.us.us
+                                        #   in Loop: Header=BB96_44 Depth=2
+	movzbl	33(%r14), %esi
+	addq	$7, %rsi
+	shrq	$3, %rsi
+	imulq	%rdx, %rsi
+	addq	16(%r14), %rsi
+	movq	-72(%rbp), %rdx                 # 8-byte Reload
+	imull	%edx, %eax
+	cltq
+	leaq	(%rax,%rbp), %rdi
+	addq	$-4480, %rdi                    # imm = 0xEE80
+	callq	memcpy@PLT
+	cmpl	-96(%rbp), %r13d                # 4-byte Folded Reload
+	jne	.LBB96_43
+# %bb.49:                               # %_ZN6Halide7Runtime8Internal10ScopedFile5writeEPKvm.exit689.us.us.us.us.us.us
+                                        #   in Loop: Header=BB96_44 Depth=2
+	movl	$1, %edx
+	leaq	-4480(%rbp), %rdi
+	movq	-248(%rbp), %rsi                # 8-byte Reload
+	movq	-56(%rbp), %rcx                 # 8-byte Reload
+	callq	fwrite@PLT
+	xorl	%r13d, %r13d
+	testq	%rax, %rax
+	jne	.LBB96_43
+	jmp	.LBB96_85
+.LBB96_50:                              # %for.inc399.us.us.us.us.us
+                                        #   in Loop: Header=BB96_42 Depth=1
+	incl	%r12d
+	cmpl	-64(%rbp), %r12d                # 4-byte Folded Reload
+	movl	-104(%rbp), %ebx                # 4-byte Reload
+	jl	.LBB96_42
+# %bb.51:                               # %for.inc404.us.us.us
+                                        #   in Loop: Header=BB96_42 Depth=1
+	movl	-80(%rbp), %eax                 # 4-byte Reload
+	incl	%eax
+	movl	%eax, -80(%rbp)                 # 4-byte Spill
+	cmpl	-60(%rbp), %eax                 # 4-byte Folded Reload
+	movl	-100(%rbp), %r12d               # 4-byte Reload
+	jl	.LBB96_42
+# %bb.52:                               # %for.end408
+	testl	%r13d, %r13d
+	movq	-56(%rbp), %r12                 # 8-byte Reload
+	movq	-88(%rbp), %rcx                 # 8-byte Reload
+	jle	.LBB96_54
+# %bb.53:                               # %_ZN6Halide7Runtime8Internal10ScopedFile5writeEPKvm.exit695
+	imull	-72(%rbp), %r13d                # 4-byte Folded Reload
+	movslq	%r13d, %rsi
+	leaq	-4480(%rbp), %rdi
+	movl	$1, %edx
+	movq	%r12, %rcx
+	callq	fwrite@PLT
+	movq	-88(%rbp), %rcx                 # 8-byte Reload
+	testq	%rax, %rax
+	je	.LBB96_26
+.LBB96_54:                              # %if.end417
+	movq	$0, -384(%rbp)
+	testl	%ecx, %ecx
+	je	.LBB96_56
+# %bb.55:                               # %_ZN6Halide7Runtime8Internal10ScopedFile5writeEPKvm.exit701
+	movl	%ecx, %esi
+	leaq	-384(%rbp), %rdi
+	movl	$1, %edx
+	movq	%r12, %rcx
+	callq	fwrite@PLT
+	testq	%rax, %rax
+	je	.LBB96_26
+.LBB96_56:                              # %if.end428
+	xorl	%r15d, %r15d
+	jmp	.LBB96_86
+.LBB96_57:                              # %cleanup155.thread
+	movq	-56(%rbp), %r12                 # 8-byte Reload
+	movl	$-13, %r15d
+	jmp	.LBB96_86
+.LBB96_58:
+	movq	%rdx, %r15
+.LBB96_59:                              # %while.end188
+	cmpq	$-1, %r15
+	je	.LBB96_65
+# %bb.60:                               # %while.body192.preheader
+	movq	%r15, %rdx
+	notq	%rdx
+	addq	%r15, %rbx
+	xorl	%ecx, %ecx
+.LBB96_61:                              # %while.body192
+                                        # =>This Inner Loop Header: Depth=1
+	leaq	(%rbx,%rcx), %rsi
+	movzbl	(%rax,%rsi), %esi
+	movb	%sil, -4480(%rbp,%rcx)
+	incq	%rcx
+	cmpq	%rcx, %rdx
+	jne	.LBB96_61
+# %bb.62:                               # %while.cond196.preheader
+	leaq	-1(%rcx), %rax
+	cmpq	$255, %rax
+	jb	.LBB96_66
+	jmp	.LBB96_67
+.LBB96_65:
+	xorl	%ecx, %ecx
+.LBB96_66:                              # %while.body199
+                                        # =>This Inner Loop Header: Depth=1
+	movb	$0, -4480(%rbp,%rcx)
+	incq	%rcx
+	cmpq	$256, %rcx                      # imm = 0x100
+	jne	.LBB96_66
+.LBB96_67:                              # %_ZN6Halide7Runtime8Internal10ScopedFile5writeEPKvm.exit637
+	vmovups	.L__const.halide_debug_to_file.header+96(%rip), %ymm0
+	vmovups	%ymm0, -288(%rbp)
+	vmovups	.L__const.halide_debug_to_file.header+64(%rip), %ymm0
+	vmovups	%ymm0, -320(%rbp)
+	vmovups	.L__const.halide_debug_to_file.header+32(%rip), %ymm0
+	vmovups	%ymm0, -352(%rbp)
+	vmovups	.L__const.halide_debug_to_file.header(%rip), %ymm0
+	vmovups	%ymm0, -384(%rbp)
+	movb	$0, -256(%rbp)
+	leaq	-384(%rbp), %rdi
+	movl	$1, %ebx
+	movl	$128, %esi
+	movl	$1, %edx
+	movq	-56(%rbp), %rcx                 # 8-byte Reload
+	vzeroupper
+	callq	fwrite@PLT
+	movl	36(%r14), %eax
+	testl	%eax, %eax
+	jle	.LBB96_77
+# %bb.68:                               # %for.body.lr.ph.i.i
+	movq	40(%r14), %rcx
+	movq	%rax, %rdx
+	shlq	$4, %rdx
+	xorl	%esi, %esi
+	xorl	%ebx, %ebx
+	jmp	.LBB96_70
+.LBB96_69:                              # %if.end.i.i
+                                        #   in Loop: Header=BB96_70 Depth=1
+	addq	$16, %rsi
+	cmpq	%rsi, %rdx
+	je	.LBB96_72
+.LBB96_70:                              # %for.body.i.i
+                                        # =>This Inner Loop Header: Depth=1
+	movl	8(%rcx,%rsi), %edi
+	testl	%edi, %edi
+	jle	.LBB96_69
+# %bb.71:                               # %if.then.i.i
+                                        #   in Loop: Header=BB96_70 Depth=1
+	movslq	4(%rcx,%rsi), %r8
+	decq	%r8
+	imulq	%rdi, %r8
+	addq	%r8, %rbx
+	jmp	.LBB96_69
+.LBB96_72:                              # %for.body.i12.i.preheader
+	xorl	%esi, %esi
+	xorl	%edi, %edi
+	jmp	.LBB96_74
+.LBB96_73:                              # %if.end.i23.i
+                                        #   in Loop: Header=BB96_74 Depth=1
+	addq	$16, %rsi
+	cmpq	%rsi, %rdx
+	je	.LBB96_76
+.LBB96_74:                              # %for.body.i12.i
+                                        # =>This Inner Loop Header: Depth=1
+	movslq	8(%rcx,%rsi), %r8
+	testq	%r8, %r8
+	jns	.LBB96_73
+# %bb.75:                               # %if.then.i19.i
+                                        #   in Loop: Header=BB96_74 Depth=1
+	movslq	4(%rcx,%rsi), %r9
+	decq	%r9
+	imulq	%r8, %r9
+	addq	%r9, %rdi
+	jmp	.LBB96_73
+.LBB96_76:                              # %_ZNK15halide_buffer_t12begin_offsetEv.exit.loopexit.i
+	subq	%rdi, %rbx
+	incq	%rbx
+.LBB96_77:                              # %_ZNK15halide_buffer_t13size_in_bytesEv.exit
+	movzbl	33(%r14), %esi
+	addq	$7, %rsi
+	shrq	$3, %rsi
+	imulq	%rbx, %rsi
+	movl	%esi, %edx
+	negl	%edx
+	andl	$7, %edx
+	leaq	(%rdx,%rsi), %rcx
+	shrq	$32, %rcx
+	jne	.LBB96_84
+# %bb.78:                               # %_ZN6Halide7Runtime8Internal10ScopedFile5writeEPKvm.exit643
+	movl	$6, %r13d
+	subl	%r15d, %r13d
+	andl	$-8, %r13d
+	cmpl	$3, %eax
+	movl	$2, %ecx
+	cmovgel	%eax, %ecx
+	movl	$14, -240(%rbp)
+	leal	4(,%rcx,4), %ebx
+	movl	%ecx, %eax
+	shll	$2, %eax
+	andl	$-8, %ebx
+	leal	(%rbx,%r13), %ecx
+	movq	%rsi, -96(%rbp)                 # 8-byte Spill
+	addl	%esi, %ecx
+	movq	%rdx, -88(%rbp)                 # 8-byte Spill
+	addl	%edx, %ecx
+	addl	$40, %ecx
+	movl	%ecx, -236(%rbp)
+	movabsq	$34359738374, %rcx              # imm = 0x800000006
+	movq	%rcx, -232(%rbp)
+	movslq	%r12d, %rdx
+	movq	_ZN6Halide7Runtime8Internal31pixel_type_to_matlab_class_codeE@GOTPCREL(%rip), %rcx
+	movq	%rdx, -80(%rbp)                 # 8-byte Spill
+	movzbl	(%rcx,%rdx), %ecx
+	movl	%ecx, -224(%rbp)
+	movabsq	$21474836481, %rcx              # imm = 0x500000001
+	movq	%rcx, -220(%rbp)
+	movl	%eax, -212(%rbp)
+	leaq	-240(%rbp), %rdi
+	movl	$32, %esi
+	movl	$1, %edx
+	movq	-56(%rbp), %r12                 # 8-byte Reload
+	movq	%r12, %rcx
+	callq	fwrite@PLT
+	testq	%rax, %rax
+	je	.LBB96_26
+# %bb.79:                               # %_ZN6Halide7Runtime8Internal10ScopedFile5writeEPKvm.exit649
+	movl	-188(%rbp), %eax
+	movl	-172(%rbp), %ecx
+	movl	%eax, -48(%rbp)                 # 4-byte Spill
+	movl	%eax, -208(%rbp)
+	movl	%ecx, -44(%rbp)                 # 4-byte Spill
+	movl	%ecx, -204(%rbp)
+	movl	-156(%rbp), %eax
+	movl	%eax, -64(%rbp)                 # 4-byte Spill
+	movl	%eax, -200(%rbp)
+	movl	-140(%rbp), %eax
+	movl	%eax, -60(%rbp)                 # 4-byte Spill
+	movl	%eax, -196(%rbp)
+	movslq	%ebx, %rsi
+	leaq	-208(%rbp), %rdi
+	movl	$1, %edx
+	movq	%r12, %rcx
+	callq	fwrite@PLT
+	testq	%rax, %rax
+	je	.LBB96_26
+# %bb.80:                               # %_ZN6Halide7Runtime8Internal10ScopedFile5writeEPKvm.exit655
+	notl	%r15d
+	movl	$1, -124(%rbp)
+	movl	%r15d, -120(%rbp)
+	leaq	-124(%rbp), %rdi
+	movl	$8, %esi
+	movl	$1, %edx
+	movq	%r12, %rcx
+	callq	fwrite@PLT
+	testq	%rax, %rax
+	je	.LBB96_26
+# %bb.81:                               # %_ZN6Halide7Runtime8Internal10ScopedFile5writeEPKvm.exit661
+	movl	%r13d, %esi
+	leaq	-4480(%rbp), %rdi
+	movl	$1, %edx
+	movq	%r12, %rcx
+	callq	fwrite@PLT
+	testq	%rax, %rax
+	je	.LBB96_26
+# %bb.82:                               # %_ZN6Halide7Runtime8Internal10ScopedFile5writeEPKvm.exit667
+	movq	_ZN6Halide7Runtime8Internal30pixel_type_to_matlab_type_codeE@GOTPCREL(%rip), %rax
+	movq	-80(%rbp), %rcx                 # 8-byte Reload
+	movzbl	(%rax,%rcx), %eax
+	movl	%eax, -116(%rbp)
+	movq	-96(%rbp), %rax                 # 8-byte Reload
+	movl	%eax, -112(%rbp)
+	leaq	-116(%rbp), %rdi
+	movl	$8, %esi
+	movl	$1, %edx
+	movq	%r12, %rcx
+	callq	fwrite@PLT
+	testq	%rax, %rax
+	je	.LBB96_26
+# %bb.83:                               # %cleanup283
+	movl	-64(%rbp), %ebx                 # 4-byte Reload
+	movl	-60(%rbp), %r15d                # 4-byte Reload
+	movq	-88(%rbp), %rcx                 # 8-byte Reload
+	jmp	.LBB96_36
+.LBB96_26:                              # %cleanup159.thread
+	movl	$-13, %r15d
+	jmp	.LBB96_86
+.LBB96_84:                              # %cleanup283.thread
+	leaq	.L.str.6.40(%rip), %rsi
+	movq	%r13, %rdi
+	callq	halide_error@PLT
+.LBB96_85:                              # %cleanup406
+	movl	$-13, %r15d
+	movq	-56(%rbp), %r12                 # 8-byte Reload
+.LBB96_86:                              # %cleanup438
+	movq	%r12, %rdi
+	callq	fclose@PLT
+.LBB96_87:                              # %return
+	movl	%r15d, %eax
+	addq	$4440, %rsp                     # imm = 0x1158
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end96:
+	.size	halide_debug_to_file, .Lfunc_end96-halide_debug_to_file
+                                        # -- End function
+	.section	.text.halide_cache_cleanup,"ax",@progbits
+	.weak	halide_cache_cleanup            # -- Begin function halide_cache_cleanup
+	.p2align	4, 0x90
+	.type	halide_cache_cleanup,@function
+halide_cache_cleanup:                   # @halide_cache_cleanup -- thin wrapper: does nothing itself and forwards to halide_memoization_cache_cleanup
+# %bb.0:                                # %entry
+	pushq	%rbp                            # open a frame ...
+	movq	%rsp, %rbp
+	popq	%rbp                            # ... and close it again at once; no stack locals are used
+	jmp	halide_memoization_cache_cleanup@PLT # TAILCALL -- jump, not call, so the callee returns straight to our caller
+.Lfunc_end97:
+	.size	halide_cache_cleanup, .Lfunc_end97-halide_cache_cleanup
+                                        # -- End function
+	.section	.text.halide_memoization_cache_cleanup,"ax",@progbits
+	.weak	halide_memoization_cache_cleanup # -- Begin function halide_memoization_cache_cleanup
+	.p2align	4, 0x90
+	.type	halide_memoization_cache_cleanup,@function
+halide_memoization_cache_cleanup:       # @halide_memoization_cache_cleanup -- clears every cache_entries slot, destroys and frees the entries chained from it, then zeroes current_cache_size, most_recently_used and least_recently_used
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15                            # save the callee-saved registers used below
+	pushq	%r14
+	pushq	%r12
+	pushq	%rbx
+	movq	_ZN6Halide7Runtime8Internal13cache_entriesE@GOTPCREL(%rip), %r14   # r14 = slot cursor, starts at the address of cache_entries
+	movl	$2048, %r15d                    # imm = 0x800; byte span walked below (8-byte slots, i.e. 256 of them)
+	addq	_ZN6Halide7Runtime8Internal13cache_entriesE@GOTPCREL(%rip), %r15   # r15 = one-past-the-end address of that span
+	jmp	.LBB98_1
+	.p2align	4, 0x90
+.LBB98_3:                               # %while.end
+                                        #   in Loop: Header=BB98_1 Depth=1
+	addq	$8, %r14                        # advance to the next 8-byte slot
+	cmpq	%r15, %r14
+	je	.LBB98_4                        # all slots visited
+.LBB98_1:                               # %for.body
+                                        # =>This Loop Header: Depth=1
+                                        #     Child Loop BB98_2 Depth 2
+	movq	(%r14), %rbx                    # rbx = pointer stored in the slot
+	movq	$0, (%r14)                      # clear the slot before its chain is freed
+	testq	%rbx, %rbx
+	je	.LBB98_3                        # slot was empty
+	.p2align	4, 0x90
+.LBB98_2:                               # %while.body
+                                        #   Parent Loop BB98_1 Depth=1
+                                        # =>  This Inner Loop Header: Depth=2
+	movq	(%rbx), %r12                    # read the 8-byte field at offset 0 before the entry is freed; it is used as the next entry (presumably the chain link -- confirm against CacheEntry)
+	movq	%rbx, %rdi                      # arg0 = entry pointer (this)
+	callq	_ZN6Halide7Runtime8Internal10CacheEntry7destroyEv@PLT
+	xorl	%edi, %edi                      # arg0 = 0
+	movq	%rbx, %rsi                      # arg1 = the entry itself
+	callq	halide_free@PLT
+	movq	%r12, %rbx                      # continue with the value read before the free
+	testq	%r12, %r12
+	jne	.LBB98_2                        # loop until that value is zero
+	jmp	.LBB98_3
+.LBB98_4:                               # %for.cond.cleanup
+	movq	_ZN6Halide7Runtime8Internal18current_cache_sizeE@GOTPCREL(%rip), %rax
+	movq	$0, (%rax)                      # current_cache_size = 0 (8-byte store)
+	movq	_ZN6Halide7Runtime8Internal18most_recently_usedE@GOTPCREL(%rip), %rax
+	movq	$0, (%rax)                      # most_recently_used = 0
+	movq	_ZN6Halide7Runtime8Internal19least_recently_usedE@GOTPCREL(%rip), %rax
+	movq	$0, (%rax)                      # least_recently_used = 0
+	popq	%rbx                            # restore in reverse push order
+	popq	%r12
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end98:
+	.size	halide_memoization_cache_cleanup, .Lfunc_end98-halide_memoization_cache_cleanup
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal10CacheEntry7destroyEv,"ax",@progbits
+	.weak	_ZN6Halide7Runtime8Internal10CacheEntry7destroyEv # -- Begin function _ZN6Halide7Runtime8Internal10CacheEntry7destroyEv
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal10CacheEntry7destroyEv,@function
+_ZN6Halide7Runtime8Internal10CacheEntry7destroyEv: # @_ZN6Halide7Runtime8Internal10CacheEntry7destroyEv -- CacheEntry::destroy(): for each 56-byte record in the array at this+72 calls halide_device_free and frees the record's header block, then frees the pointer at this+24
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%rbx
+	pushq	%rax                            # extra 8-byte slot, never read; dropped again by the addq $8 in the epilogue
+	movq	%rdi, %rbx                      # rbx = this
+	cmpl	$0, 56(%rdi)                    # 32-bit record count at this+56
+	je	.LBB99_3                        # zero records: skip the loop
+# %bb.1:                                # %for.body.lr.ph
+	xorl	%r14d, %r14d                    # r14 = byte offset of the current record
+	xorl	%r15d, %r15d                    # r15 = record index
+	.p2align	4, 0x90
+.LBB99_2:                               # %for.body
+                                        # =>This Inner Loop Header: Depth=1
+	movq	72(%rbx), %rsi                  # base of the record array (reloaded every iteration)
+	addq	%r14, %rsi                      # arg1 = address of the current record
+	xorl	%edi, %edi                      # arg0 = 0
+	callq	halide_device_free@PLT
+	movq	72(%rbx), %rax
+	movq	16(%rax,%r14), %rdi             # arg0 = 8-byte pointer stored at record+16
+	callq	_ZN6Halide7Runtime8Internal21get_pointer_to_headerEPh@PLT
+	xorl	%edi, %edi                      # arg0 = 0
+	movq	%rax, %rsi                      # arg1 = header pointer returned by the call above
+	callq	halide_free@PLT
+	incq	%r15
+	movl	56(%rbx), %eax                  # reload the count; the 32-bit move zero-extends it
+	addq	$56, %r14                       # record stride is 56 bytes
+	cmpq	%rax, %r15
+	jb	.LBB99_2                        # unsigned compare: continue while index is below count
+.LBB99_3:                               # %for.cond.cleanup
+	movq	24(%rbx), %rsi                  # arg1 = pointer stored at this+24
+	xorl	%edi, %edi                      # arg0 = 0
+	addq	$8, %rsp                        # drop the extra slot pushed in the prologue
+	popq	%rbx
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	jmp	halide_free@PLT                 # TAILCALL -- halide_free returns directly to our caller
+.Lfunc_end99:
+	.size	_ZN6Halide7Runtime8Internal10CacheEntry7destroyEv, .Lfunc_end99-_ZN6Halide7Runtime8Internal10CacheEntry7destroyEv
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal21get_pointer_to_headerEPh,"ax",@progbits
+	.weak	_ZN6Halide7Runtime8Internal21get_pointer_to_headerEPh # -- Begin function _ZN6Halide7Runtime8Internal21get_pointer_to_headerEPh
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal21get_pointer_to_headerEPh,@function
+_ZN6Halide7Runtime8Internal21get_pointer_to_headerEPh: # @_ZN6Halide7Runtime8Internal21get_pointer_to_headerEPh -- get_pointer_to_header(unsigned char*): returns the argument minus 64 bytes; reads no memory
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	leaq	-64(%rdi), %rax                 # result = argument - 64 (pure address arithmetic, nothing is dereferenced)
+	popq	%rbp
+	retq
+.Lfunc_end100:
+	.size	_ZN6Halide7Runtime8Internal21get_pointer_to_headerEPh, .Lfunc_end100-_ZN6Halide7Runtime8Internal21get_pointer_to_headerEPh
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal18copy_memory_helperERKNS1_11device_copyEixx,"ax",@progbits
+	.weak	_ZN6Halide7Runtime8Internal18copy_memory_helperERKNS1_11device_copyEixx # -- Begin function _ZN6Halide7Runtime8Internal18copy_memory_helperERKNS1_11device_copyEixx
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal18copy_memory_helperERKNS1_11device_copyEixx,@function
+_ZN6Halide7Runtime8Internal18copy_memory_helperERKNS1_11device_copyEixx: # @_ZN6Halide7Runtime8Internal18copy_memory_helperERKNS1_11device_copyEixx -- copy_memory_helper(const device_copy ref, int d, long long, long long): recursive per-dimension copy; skips dimensions whose count is 1 and ends in one memcpy per innermost run. NOTE(review): offsets 24/152/280/408 look like extent[], source stride[], destination stride[] and chunk size of device_copy -- confirm against the Halide runtime headers
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	pushq	%rax                            # reserves the 8-byte slot at -48(%rbp) used as a spill below
+	movq	%rcx, %r12                      # r12 = 4th argument (added to the memcpy destination)
+	movq	%rdx, %r14                      # r14 = 3rd argument (added to the memcpy source)
+	movl	%esi, %r15d                     # r15d = d
+	movq	%rdi, %rbx                      # rbx = pointer to the device_copy
+	testl	%esi, %esi
+	js	.LBB101_8                       # negative d is handled separately
+# %bb.1:                                # %land.rhs.preheader
+	movl	%r15d, %r15d                    # zero-extend d so it can index 8-byte elements
+	.p2align	4, 0x90
+.LBB101_2:                              # %land.rhs
+                                        # =>This Inner Loop Header: Depth=1
+	cmpq	$1, 24(%rbx,%r15,8)             # 8-byte element d of the array at offset 24
+	jne	.LBB101_4                       # count is not 1: this dimension has to be iterated
+# %bb.3:                                # %while.body
+                                        #   in Loop: Header=BB101_2 Depth=1
+	leaq	-1(%r15), %rax                  # rax = d - 1 (lea leaves the flags untouched)
+	testq	%r15, %r15                      # flags reflect the value of d before the decrement
+	movq	%rax, %r15
+	jg	.LBB101_2                       # keep skipping while the old d was above 0
+	jmp	.LBB101_9                       # every dimension down to 0 had count 1: do the memcpy
+.LBB101_8:                              # %while.end
+	cmpl	$-1, %r15d
+	je	.LBB101_9                       # d == -1: do the memcpy
+.LBB101_4:                              # %for.cond.preheader
+	movslq	%r15d, %rax                     # rax = sign-extended d, used as the array index
+	cmpq	$0, 24(%rbx,%rax,8)
+	je	.LBB101_7                       # count of 0: nothing to copy
+# %bb.5:                                # %for.body.lr.ph
+	decl	%r15d                           # the recursive calls use d - 1
+	xorl	%r13d, %r13d                    # r13 = iteration counter
+	movq	%rax, -48(%rbp)                 # 8-byte Spill
+	.p2align	4, 0x90
+.LBB101_6:                              # %for.body
+                                        # =>This Inner Loop Header: Depth=1
+	movq	%rbx, %rdi                      # same device_copy
+	movl	%r15d, %esi                     # d - 1
+	movq	%r14, %rdx                      # current source offset
+	movq	%r12, %rcx                      # current destination offset
+	callq	_ZN6Halide7Runtime8Internal18copy_memory_helperERKNS1_11device_copyEixx@PLT
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	addq	152(%rbx,%rax,8), %r14          # source offset += element d of the array at offset 152
+	addq	280(%rbx,%rax,8), %r12          # destination offset += element d of the array at offset 280
+	incq	%r13
+	cmpq	24(%rbx,%rax,8), %r13
+	jb	.LBB101_6                       # unsigned compare: repeat while the counter is below the count
+.LBB101_7:                              # %if.end
+	addq	$8, %rsp                        # drop the spill slot
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.LBB101_9:                              # %if.then
+	addq	(%rbx), %r14                    # source address = 8-byte field at offset 0 + source offset
+	addq	8(%rbx), %r12                   # destination address = 8-byte field at offset 8 + destination offset
+	movq	408(%rbx), %rdx                 # memcpy length = 8-byte field at offset 408
+	movq	%r12, %rdi                      # memcpy arg0 = destination
+	movq	%r14, %rsi                      # memcpy arg1 = source
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	jmp	memcpy@PLT                      # TAILCALL
+.Lfunc_end101:
+	.size	_ZN6Halide7Runtime8Internal18copy_memory_helperERKNS1_11device_copyEixx, .Lfunc_end101-_ZN6Halide7Runtime8Internal18copy_memory_helperERKNS1_11device_copyEixx
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal11copy_memoryERKNS1_11device_copyEPv,"ax",@progbits
+	.weak	_ZN6Halide7Runtime8Internal11copy_memoryERKNS1_11device_copyEPv # -- Begin function _ZN6Halide7Runtime8Internal11copy_memoryERKNS1_11device_copyEPv
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal11copy_memoryERKNS1_11device_copyEPv,@function
+_ZN6Halide7Runtime8Internal11copy_memoryERKNS1_11device_copyEPv: # @_ZN6Halide7Runtime8Internal11copy_memoryERKNS1_11device_copyEPv -- copy_memory(const device_copy ref, void*): returns at once when the 8-byte fields at offsets 0 and 8 are equal, otherwise tail-calls copy_memory_helper starting at dimension 15; the void* argument is never read
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	(%rdi), %rax                    # 8-byte field at offset 0
+	cmpq	8(%rdi), %rax                   # compare with the 8-byte field at offset 8
+	jne	.LBB102_2                       # they differ: a copy is needed
+# %bb.1:                                # %if.end
+	popq	%rbp                            # fields are equal: nothing to do
+	retq
+.LBB102_2:                              # %if.then
+	movq	16(%rdi), %rdx                  # helper arg2 = 8-byte field at offset 16 (initial source offset)
+	movl	$15, %esi                       # helper arg1 = 15, the starting dimension index; overwrites the unused void* argument
+	xorl	%ecx, %ecx                      # helper arg3 = 0 (initial destination offset)
+	popq	%rbp
+	jmp	_ZN6Halide7Runtime8Internal18copy_memory_helperERKNS1_11device_copyEixx@PLT # TAILCALL
+.Lfunc_end102:
+	.size	_ZN6Halide7Runtime8Internal11copy_memoryERKNS1_11device_copyEPv, .Lfunc_end102-_ZN6Halide7Runtime8Internal11copy_memoryERKNS1_11device_copyEPv
+                                        # -- End function
+	.section	.rodata.cst8,"aM",@progbits,8
+	.p2align	3, 0x0                          # -- Begin function _ZN6Halide7Runtime8Internal16make_buffer_copyEPK15halide_buffer_tbS4_b
+.LCPI103_0:
+	.quad	1                               # 0x1
+	.section	.text._ZN6Halide7Runtime8Internal16make_buffer_copyEPK15halide_buffer_tbS4_b,"ax",@progbits
+	.weak	_ZN6Halide7Runtime8Internal16make_buffer_copyEPK15halide_buffer_tbS4_b
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal16make_buffer_copyEPK15halide_buffer_tbS4_b,@function
+_ZN6Halide7Runtime8Internal16make_buffer_copyEPK15halide_buffer_tbS4_b: # @_ZN6Halide7Runtime8Internal16make_buffer_copyEPK15halide_buffer_tbS4_b
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	subq	$712, %rsp                      # imm = 0x2C8
+	movq	%rdi, %rbx
+	testl	%edx, %edx
+	je	.LBB103_2
+# %bb.1:                                # %cond.true
+	movq	16(%rsi), %rax
+	movq	%rax, -712(%rbp)
+	testb	%r8b, %r8b
+	jne	.LBB103_4
+.LBB103_5:                              # %cond.false6
+	movq	(%rcx), %rax
+	jmp	.LBB103_6
+.LBB103_2:                              # %cond.false
+	movq	(%rsi), %rax
+	movq	%rax, -712(%rbp)
+	testb	%r8b, %r8b
+	je	.LBB103_5
+.LBB103_4:                              # %cond.true4
+	movq	16(%rcx), %rax
+.LBB103_6:                              # %cond.end8
+	movq	%rax, -704(%rbp)
+	movzbl	33(%rsi), %edx
+	leaq	7(%rdx), %rdi
+	shrq	$3, %rdi
+	movq	%rdi, -304(%rbp)
+	vbroadcastsd	.LCPI103_0(%rip), %ymm0 # ymm0 = [1,1,1,1]
+	vmovups	%ymm0, -688(%rbp)
+	vxorps	%xmm1, %xmm1, %xmm1
+	vmovups	%ymm1, -560(%rbp)
+	vmovups	%ymm1, -432(%rbp)
+	vmovups	%ymm0, -656(%rbp)
+	vmovups	%ymm1, -528(%rbp)
+	vmovups	%ymm1, -400(%rbp)
+	vmovups	%ymm0, -624(%rbp)
+	vmovups	%ymm1, -496(%rbp)
+	vmovups	%ymm1, -368(%rbp)
+	vmovups	%ymm0, -592(%rbp)
+	vmovups	%ymm1, -464(%rbp)
+	vmovups	%ymm1, -336(%rbp)
+	movl	36(%rsi), %eax
+	testl	%eax, %eax
+	jle	.LBB103_7
+# %bb.12:                               # %for.body19.lr.ph
+	movq	40(%rsi), %r9
+	movq	40(%rcx), %r10
+	movq	%rax, %r11
+	shlq	$4, %r11
+	xorl	%r14d, %r14d
+	xorl	%r8d, %r8d
+	.p2align	4, 0x90
+.LBB103_13:                             # %for.body19
+                                        # =>This Inner Loop Header: Depth=1
+	movslq	8(%r9,%r14), %r15
+	movslq	(%r10,%r14), %r12
+	movslq	(%r9,%r14), %r13
+	subq	%r13, %r12
+	imulq	%r15, %r12
+	addq	%r12, %r8
+	addq	$16, %r14
+	cmpq	%r14, %r11
+	jne	.LBB103_13
+# %bb.8:                                # %for.cond.cleanup18
+	imulq	%rdi, %r8
+	movq	%r8, -696(%rbp)
+	cmpl	36(%rcx), %eax
+	je	.LBB103_9
+	jmp	.LBB103_11
+.LBB103_7:
+	xorl	%r8d, %r8d
+	imulq	%rdi, %r8
+	movq	%r8, -696(%rbp)
+	cmpl	36(%rcx), %eax
+	jne	.LBB103_11
+.LBB103_9:                              # %lor.lhs.false
+	movzbl	33(%rcx), %r8d
+	addl	$7, %r8d
+	shrl	$3, %r8d
+	cmpl	%r8d, %edi
+	jne	.LBB103_11
+# %bb.10:                               # %lor.lhs.false
+	cmpl	$17, %eax
+	jge	.LBB103_11
+# %bb.14:                               # %if.end
+	testb	%dl, %dl
+	je	.LBB103_11
+# %bb.15:                               # %for.cond54.preheader
+	testl	%eax, %eax
+	jle	.LBB103_16
+# %bb.37:                               # %for.body58.lr.ph
+	movq	40(%rcx), %rcx
+	movq	40(%rsi), %rdx
+	xorl	%esi, %esi
+	jmp	.LBB103_38
+	.p2align	4, 0x90
+.LBB103_47:                             # %for.cond.cleanup94
+                                        #   in Loop: Header=BB103_38 Depth=1
+	movslq	4(%rcx,%r9), %r9
+	movq	%r9, -688(%rbp,%r11,8)
+	movq	%r8, -432(%rbp,%r11,8)
+	movq	%r10, -560(%rbp,%r11,8)
+	incq	%rsi
+	cmpq	%rax, %rsi
+	je	.LBB103_17
+.LBB103_38:                             # %for.body58
+                                        # =>This Loop Header: Depth=1
+                                        #     Child Loop BB103_41 Depth 2
+                                        #     Child Loop BB103_46 Depth 2
+	movq	%rsi, %r9
+	shlq	$4, %r9
+	movslq	8(%rcx,%r9), %r8
+	imulq	%rdi, %r8
+	movl	$0, %r11d
+	testq	%rsi, %rsi
+	je	.LBB103_44
+# %bb.39:                               # %for.body81.lr.ph
+                                        #   in Loop: Header=BB103_38 Depth=1
+	testq	%r8, %r8
+	je	.LBB103_48
+# %bb.40:                               # %for.body81.preheader
+                                        #   in Loop: Header=BB103_38 Depth=1
+	xorl	%r11d, %r11d
+	.p2align	4, 0x90
+.LBB103_41:                             # %for.body81
+                                        #   Parent Loop BB103_38 Depth=1
+                                        # =>  This Inner Loop Header: Depth=2
+	cmpq	-432(%rbp,%r11,8), %r8
+	jb	.LBB103_44
+# %bb.42:                               # %for.inc89
+                                        #   in Loop: Header=BB103_41 Depth=2
+	incq	%r11
+	cmpq	%r11, %rsi
+	jne	.LBB103_41
+# %bb.43:                               #   in Loop: Header=BB103_38 Depth=1
+	movq	%rsi, %r11
+	jmp	.LBB103_44
+	.p2align	4, 0x90
+.LBB103_48:                             # %for.body81.us.preheader
+                                        #   in Loop: Header=BB103_38 Depth=1
+	movl	%esi, %r11d
+.LBB103_44:                             # %for.end91
+                                        #   in Loop: Header=BB103_38 Depth=1
+	movslq	8(%rdx,%r9), %r10
+	imulq	%rdi, %r10
+	movl	%r11d, %r11d
+	cmpq	%r11, %rsi
+	jbe	.LBB103_47
+# %bb.45:                               # %for.body95.preheader
+                                        #   in Loop: Header=BB103_38 Depth=1
+	movslq	%r11d, %r14
+	movq	%rsi, %r15
+	.p2align	4, 0x90
+.LBB103_46:                             # %for.body95
+                                        #   Parent Loop BB103_38 Depth=1
+                                        # =>  This Inner Loop Header: Depth=2
+	movq	-696(%rbp,%r15,8), %r12
+	movq	-568(%rbp,%r15,8), %r13
+	movq	%r12, -688(%rbp,%r15,8)
+	movq	-440(%rbp,%r15,8), %r12
+	movq	%r12, -432(%rbp,%r15,8)
+	movq	%r13, -560(%rbp,%r15,8)
+	leaq	-1(%r15), %r12
+	movq	%r12, %r15
+	cmpq	%r14, %r12
+	jg	.LBB103_46
+	jmp	.LBB103_47
+.LBB103_11:                             # %if.then
+	vxorps	%xmm0, %xmm0, %xmm0
+	vmovups	%ymm0, 384(%rbx)
+	vmovups	%ymm0, 352(%rbx)
+	vmovups	%ymm0, 320(%rbx)
+	vmovups	%ymm0, 288(%rbx)
+	vmovups	%ymm0, 256(%rbx)
+	vmovups	%ymm0, 224(%rbx)
+	vmovups	%ymm0, 192(%rbx)
+	vmovups	%ymm0, 160(%rbx)
+	vmovups	%ymm0, 128(%rbx)
+	vmovups	%ymm0, 96(%rbx)
+	vmovups	%ymm0, 64(%rbx)
+	vmovups	%ymm0, 32(%rbx)
+	vmovups	%ymm0, (%rbx)
+.LBB103_36:                             # %while.end
+	movq	%rbx, %rax
+	addq	$712, %rsp                      # imm = 0x2C8
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	vzeroupper
+	retq
+.LBB103_17:                             # %while.cond.preheader
+	movq	-304(%rbp), %rdi
+	testq	%rdi, %rdi
+	je	.LBB103_35
+# %bb.18:                               # %while.cond.preheader.land.lhs.true138.lr.ph_crit_edge
+	movq	-560(%rbp), %rdx
+	movq	-432(%rbp), %r12
+	movq	-680(%rbp), %rax
+	movq	%rax, -744(%rbp)                # 8-byte Spill
+	movq	-672(%rbp), %rax
+	movq	%rax, -720(%rbp)                # 8-byte Spill
+	movq	-552(%rbp), %rax
+	movq	-424(%rbp), %r14
+	movq	-544(%rbp), %rcx
+	movq	%rcx, -728(%rbp)                # 8-byte Spill
+	movq	-416(%rbp), %r10
+	movq	-664(%rbp), %rcx
+	movq	%rcx, -288(%rbp)                # 8-byte Spill
+	movq	-536(%rbp), %rcx
+	movq	%rcx, -296(%rbp)                # 8-byte Spill
+	movq	-408(%rbp), %rcx
+	movq	%rcx, -280(%rbp)                # 8-byte Spill
+	movq	-656(%rbp), %rcx
+	movq	%rcx, -264(%rbp)                # 8-byte Spill
+	movq	-528(%rbp), %rcx
+	movq	%rcx, -72(%rbp)                 # 8-byte Spill
+	movq	-400(%rbp), %rcx
+	movq	%rcx, -48(%rbp)                 # 8-byte Spill
+	movq	-648(%rbp), %rcx
+	movq	%rcx, -80(%rbp)                 # 8-byte Spill
+	movq	-520(%rbp), %rcx
+	movq	%rcx, -240(%rbp)                # 8-byte Spill
+	movq	-392(%rbp), %rcx
+	movq	%rcx, -88(%rbp)                 # 8-byte Spill
+	movq	-640(%rbp), %rcx
+	movq	%rcx, -96(%rbp)                 # 8-byte Spill
+	movq	-512(%rbp), %rcx
+	movq	%rcx, -248(%rbp)                # 8-byte Spill
+	movq	-384(%rbp), %rcx
+	movq	%rcx, -104(%rbp)                # 8-byte Spill
+	movq	-632(%rbp), %rcx
+	movq	%rcx, -112(%rbp)                # 8-byte Spill
+	movq	-504(%rbp), %rcx
+	movq	-376(%rbp), %rsi
+	movq	%rsi, -120(%rbp)                # 8-byte Spill
+	movq	-624(%rbp), %rsi
+	movq	%rsi, -128(%rbp)                # 8-byte Spill
+	movq	-496(%rbp), %r15
+	movq	-368(%rbp), %rsi
+	movq	%rsi, -136(%rbp)                # 8-byte Spill
+	movq	-616(%rbp), %rsi
+	movq	%rsi, -144(%rbp)                # 8-byte Spill
+	movq	-488(%rbp), %r13
+	movq	-360(%rbp), %rsi
+	movq	%rsi, -152(%rbp)                # 8-byte Spill
+	movq	-608(%rbp), %rsi
+	movq	%rsi, -160(%rbp)                # 8-byte Spill
+	movq	-480(%rbp), %r11
+	movq	-352(%rbp), %rsi
+	movq	%rsi, -168(%rbp)                # 8-byte Spill
+	movq	-600(%rbp), %rsi
+	movq	%rsi, -176(%rbp)                # 8-byte Spill
+	movq	-472(%rbp), %r9
+	movq	-344(%rbp), %rsi
+	movq	%rsi, -184(%rbp)                # 8-byte Spill
+	movq	-592(%rbp), %rsi
+	movq	%rsi, -192(%rbp)                # 8-byte Spill
+	movq	-464(%rbp), %rsi
+	movq	%rsi, -56(%rbp)                 # 8-byte Spill
+	movq	-336(%rbp), %r8
+	movq	%r8, -200(%rbp)                 # 8-byte Spill
+	movq	-584(%rbp), %r8
+	movq	%r8, -208(%rbp)                 # 8-byte Spill
+	movq	-456(%rbp), %r8
+	movq	%r8, -64(%rbp)                  # 8-byte Spill
+	movq	-328(%rbp), %r8
+	movq	%r8, -216(%rbp)                 # 8-byte Spill
+	movq	-576(%rbp), %r8
+	movq	%r8, -224(%rbp)                 # 8-byte Spill
+	movq	-448(%rbp), %r8
+	movq	%r8, -256(%rbp)                 # 8-byte Spill
+	movq	%r9, %rsi
+	movq	%r11, %r9
+	movq	%r13, %r11
+	movq	%r15, %r13
+	movq	%rcx, %r15
+	movq	-320(%rbp), %rcx
+	movq	%rcx, -232(%rbp)                # 8-byte Spill
+	movq	-568(%rbp), %rcx
+	movq	-440(%rbp), %r8
+	movq	%r8, -272(%rbp)                 # 8-byte Spill
+	movq	-312(%rbp), %r8
+	jmp	.LBB103_19
+.LBB103_16:
+	movl	$1, %ecx
+	xorl	%r8d, %r8d
+	xorl	%eax, %eax
+	movq	%rax, -272(%rbp)                # 8-byte Spill
+	xorl	%eax, %eax
+	movq	%rax, -232(%rbp)                # 8-byte Spill
+	xorl	%eax, %eax
+	movq	%rax, -256(%rbp)                # 8-byte Spill
+	movl	$1, %eax
+	movq	%rax, -224(%rbp)                # 8-byte Spill
+	xorl	%eax, %eax
+	movq	%rax, -216(%rbp)                # 8-byte Spill
+	xorl	%eax, %eax
+	movq	%rax, -64(%rbp)                 # 8-byte Spill
+	movl	$1, %eax
+	movq	%rax, -208(%rbp)                # 8-byte Spill
+	xorl	%eax, %eax
+	movq	%rax, -200(%rbp)                # 8-byte Spill
+	xorl	%eax, %eax
+	movq	%rax, -56(%rbp)                 # 8-byte Spill
+	movl	$1, %eax
+	movq	%rax, -192(%rbp)                # 8-byte Spill
+	xorl	%eax, %eax
+	movq	%rax, -184(%rbp)                # 8-byte Spill
+	xorl	%esi, %esi
+	movl	$1, %eax
+	movq	%rax, -176(%rbp)                # 8-byte Spill
+	xorl	%eax, %eax
+	movq	%rax, -168(%rbp)                # 8-byte Spill
+	xorl	%r9d, %r9d
+	movl	$1, %eax
+	movq	%rax, -160(%rbp)                # 8-byte Spill
+	xorl	%eax, %eax
+	movq	%rax, -152(%rbp)                # 8-byte Spill
+	xorl	%r11d, %r11d
+	movl	$1, %eax
+	movq	%rax, -144(%rbp)                # 8-byte Spill
+	xorl	%eax, %eax
+	movq	%rax, -136(%rbp)                # 8-byte Spill
+	xorl	%r13d, %r13d
+	movl	$1, %eax
+	movq	%rax, -128(%rbp)                # 8-byte Spill
+	xorl	%eax, %eax
+	movq	%rax, -120(%rbp)                # 8-byte Spill
+	xorl	%r15d, %r15d
+	movl	$1, %eax
+	movq	%rax, -112(%rbp)                # 8-byte Spill
+	xorl	%eax, %eax
+	movq	%rax, -104(%rbp)                # 8-byte Spill
+	xorl	%eax, %eax
+	movq	%rax, -248(%rbp)                # 8-byte Spill
+	movl	$1, %eax
+	movq	%rax, -96(%rbp)                 # 8-byte Spill
+	xorl	%eax, %eax
+	movq	%rax, -88(%rbp)                 # 8-byte Spill
+	xorl	%eax, %eax
+	movq	%rax, -240(%rbp)                # 8-byte Spill
+	movl	$1, %eax
+	movq	%rax, -80(%rbp)                 # 8-byte Spill
+	xorl	%eax, %eax
+	movq	%rax, -48(%rbp)                 # 8-byte Spill
+	xorl	%eax, %eax
+	movq	%rax, -72(%rbp)                 # 8-byte Spill
+	movl	$1, %eax
+	movq	%rax, -264(%rbp)                # 8-byte Spill
+	xorl	%eax, %eax
+	movq	%rax, -280(%rbp)                # 8-byte Spill
+	xorl	%eax, %eax
+	movq	%rax, -296(%rbp)                # 8-byte Spill
+	movl	$1, %eax
+	movq	%rax, -288(%rbp)                # 8-byte Spill
+	xorl	%r10d, %r10d
+	xorl	%eax, %eax
+	movq	%rax, -728(%rbp)                # 8-byte Spill
+	movl	$1, %eax
+	movq	%rax, -720(%rbp)                # 8-byte Spill
+	xorl	%r14d, %r14d
+	xorl	%eax, %eax
+	movl	$1, %edx
+	movq	%rdx, -744(%rbp)                # 8-byte Spill
+	xorl	%r12d, %r12d
+	xorl	%edx, %edx
+.LBB103_19:                             # %land.lhs.true138.lr.ph
+	cmpq	%rdx, %rdi
+	jne	.LBB103_35
+# %bb.20:                               # %land.lhs.true138.lr.ph
+	cmpq	%r12, %rdi
+	jne	.LBB103_35
+# %bb.21:                               # %while.body.peel
+	imulq	-688(%rbp), %r12
+	movq	%r12, -304(%rbp)
+	movq	-744(%rbp), %rdx                # 8-byte Reload
+	movq	%rdx, -688(%rbp)
+	movq	%rax, -560(%rbp)
+	movq	%r14, -432(%rbp)
+	movq	-720(%rbp), %rdx                # 8-byte Reload
+	movq	%rdx, -680(%rbp)
+	movq	-728(%rbp), %rdx                # 8-byte Reload
+	movq	%rdx, -552(%rbp)
+	movq	%r10, -424(%rbp)
+	movq	-288(%rbp), %rdx                # 8-byte Reload
+	movq	%rdx, -672(%rbp)
+	movq	-296(%rbp), %rdx                # 8-byte Reload
+	movq	%rdx, -544(%rbp)
+	movq	%rcx, -736(%rbp)                # 8-byte Spill
+	movq	-280(%rbp), %rcx                # 8-byte Reload
+	movq	%rcx, -416(%rbp)
+	movq	-264(%rbp), %rcx                # 8-byte Reload
+	movq	%rcx, -664(%rbp)
+	movq	-72(%rbp), %rcx                 # 8-byte Reload
+	movq	%rcx, -536(%rbp)
+	movq	-48(%rbp), %rcx                 # 8-byte Reload
+	movq	%rcx, -408(%rbp)
+	movq	-80(%rbp), %rcx                 # 8-byte Reload
+	movq	%rcx, -656(%rbp)
+	movq	-240(%rbp), %rcx                # 8-byte Reload
+	movq	%rcx, -528(%rbp)
+	movq	-88(%rbp), %rcx                 # 8-byte Reload
+	movq	%rcx, -400(%rbp)
+	movq	-96(%rbp), %rcx                 # 8-byte Reload
+	movq	%rcx, -648(%rbp)
+	movq	-248(%rbp), %rcx                # 8-byte Reload
+	movq	%rcx, -520(%rbp)
+	movq	-104(%rbp), %rcx                # 8-byte Reload
+	movq	%rcx, -392(%rbp)
+	movq	-112(%rbp), %rcx                # 8-byte Reload
+	movq	%rcx, -640(%rbp)
+	movq	%r15, -512(%rbp)
+	movq	-120(%rbp), %rcx                # 8-byte Reload
+	movq	%rcx, -384(%rbp)
+	movq	-128(%rbp), %rcx                # 8-byte Reload
+	movq	%rcx, -632(%rbp)
+	movq	%r13, -504(%rbp)
+	movq	-136(%rbp), %rcx                # 8-byte Reload
+	movq	%rcx, -376(%rbp)
+	movq	-144(%rbp), %rcx                # 8-byte Reload
+	movq	%rcx, -624(%rbp)
+	movq	%r11, -496(%rbp)
+	movq	-152(%rbp), %rcx                # 8-byte Reload
+	movq	%rcx, -368(%rbp)
+	movq	-160(%rbp), %rcx                # 8-byte Reload
+	movq	%rcx, -616(%rbp)
+	movq	%r9, -488(%rbp)
+	movq	-168(%rbp), %rcx                # 8-byte Reload
+	movq	%rcx, -360(%rbp)
+	movq	-176(%rbp), %rcx                # 8-byte Reload
+	movq	%rcx, -608(%rbp)
+	movq	%rsi, -480(%rbp)
+	movq	-184(%rbp), %rcx                # 8-byte Reload
+	movq	%rcx, -352(%rbp)
+	movq	-192(%rbp), %rcx                # 8-byte Reload
+	movq	%rcx, -600(%rbp)
+	movq	-56(%rbp), %rcx                 # 8-byte Reload
+	movq	%rcx, -472(%rbp)
+	movq	-200(%rbp), %rcx                # 8-byte Reload
+	movq	%rcx, -344(%rbp)
+	movq	-208(%rbp), %rcx                # 8-byte Reload
+	movq	%rcx, -592(%rbp)
+	movq	-64(%rbp), %rcx                 # 8-byte Reload
+	movq	%rcx, -464(%rbp)
+	movq	-216(%rbp), %rcx                # 8-byte Reload
+	movq	%rcx, -336(%rbp)
+	movq	-224(%rbp), %rcx                # 8-byte Reload
+	movq	%rcx, -584(%rbp)
+	movq	-256(%rbp), %rdx                # 8-byte Reload
+	movq	%rdx, -456(%rbp)
+	movq	-232(%rbp), %rcx                # 8-byte Reload
+	movq	%rcx, -328(%rbp)
+	movq	-736(%rbp), %rcx                # 8-byte Reload
+	movq	%rcx, -576(%rbp)
+	movq	-272(%rbp), %rdx                # 8-byte Reload
+	movq	%rdx, -448(%rbp)
+	movq	%r8, -320(%rbp)
+	movq	$1, -568(%rbp)
+	movq	$0, -440(%rbp)
+	movq	$0, -312(%rbp)
+	testq	%r12, %r12
+	je	.LBB103_35
+# %bb.22:                               # %land.lhs.true138.peel291
+	cmpq	%rax, %r12
+	jne	.LBB103_35
+# %bb.23:                               # %land.lhs.true138.peel291
+	cmpq	%r14, %r12
+	jne	.LBB103_35
+# %bb.24:                               # %while.body.peel295
+	imulq	-744(%rbp), %r14                # 8-byte Folded Reload
+	movq	%r14, -304(%rbp)
+	movq	-720(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -688(%rbp)
+	movq	-728(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -560(%rbp)
+	movq	%r10, -432(%rbp)
+	movq	-288(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -680(%rbp)
+	movq	-296(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -552(%rbp)
+	movq	-280(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -424(%rbp)
+	movq	-264(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -672(%rbp)
+	movq	-72(%rbp), %rax                 # 8-byte Reload
+	movq	%rax, -544(%rbp)
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	%rax, -416(%rbp)
+	movq	-80(%rbp), %rax                 # 8-byte Reload
+	movq	%rax, -664(%rbp)
+	movq	-240(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -536(%rbp)
+	movq	-88(%rbp), %rax                 # 8-byte Reload
+	movq	%rax, -408(%rbp)
+	movq	-96(%rbp), %rax                 # 8-byte Reload
+	movq	%rax, -656(%rbp)
+	movq	-248(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -528(%rbp)
+	movq	-104(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -400(%rbp)
+	movq	-112(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -648(%rbp)
+	movq	%r15, -520(%rbp)
+	movq	-120(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -392(%rbp)
+	movq	-128(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -640(%rbp)
+	movq	%r13, -512(%rbp)
+	movq	-136(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -384(%rbp)
+	movq	-144(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -632(%rbp)
+	movq	%r11, -504(%rbp)
+	movq	-152(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -376(%rbp)
+	movq	-160(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -624(%rbp)
+	movq	%r9, -496(%rbp)
+	movq	-168(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -368(%rbp)
+	movq	-176(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -616(%rbp)
+	movq	%rsi, -488(%rbp)
+	movq	-184(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -360(%rbp)
+	movq	-192(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -608(%rbp)
+	movq	-56(%rbp), %rax                 # 8-byte Reload
+	movq	%rax, -480(%rbp)
+	movq	-200(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -352(%rbp)
+	movq	-208(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -600(%rbp)
+	movq	-64(%rbp), %rax                 # 8-byte Reload
+	movq	%rax, -472(%rbp)
+	movq	-216(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -344(%rbp)
+	movq	-224(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -592(%rbp)
+	movq	-256(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -464(%rbp)
+	movq	-232(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -336(%rbp)
+	movq	%rcx, -584(%rbp)
+	movq	-272(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -456(%rbp)
+	movq	%r8, -328(%rbp)
+	movq	$1, -576(%rbp)
+	vxorps	%xmm1, %xmm1, %xmm1
+	vmovups	%xmm1, -448(%rbp)
+	movq	$0, -320(%rbp)
+	testq	%r14, %r14
+	je	.LBB103_35
+# %bb.25:                               # %land.lhs.true138.peel299
+	cmpq	-728(%rbp), %r14                # 8-byte Folded Reload
+	jne	.LBB103_35
+# %bb.26:                               # %land.lhs.true138.peel299
+	cmpq	%r10, %r14
+	jne	.LBB103_35
+# %bb.27:                               # %while.body.peel303
+	imulq	-720(%rbp), %r10                # 8-byte Folded Reload
+	movq	%r10, -304(%rbp)
+	movq	-288(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -688(%rbp)
+	movq	-296(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -560(%rbp)
+	movq	-280(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -432(%rbp)
+	movq	-264(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -680(%rbp)
+	movq	-72(%rbp), %rax                 # 8-byte Reload
+	movq	%rax, -552(%rbp)
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	%rax, -424(%rbp)
+	movq	-80(%rbp), %rax                 # 8-byte Reload
+	movq	%rax, -672(%rbp)
+	movq	-240(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -544(%rbp)
+	movq	-88(%rbp), %rax                 # 8-byte Reload
+	movq	%rax, -416(%rbp)
+	movq	-96(%rbp), %rax                 # 8-byte Reload
+	movq	%rax, -664(%rbp)
+	movq	-248(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -536(%rbp)
+	movq	-104(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -408(%rbp)
+	movq	-112(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -656(%rbp)
+	movq	%r15, -528(%rbp)
+	movq	-120(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -400(%rbp)
+	movq	-128(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -648(%rbp)
+	movq	%r13, -520(%rbp)
+	movq	-136(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -392(%rbp)
+	movq	-144(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -640(%rbp)
+	movq	%r11, -512(%rbp)
+	movq	-152(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -384(%rbp)
+	movq	-160(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -632(%rbp)
+	movq	%r9, -504(%rbp)
+	movq	-168(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -376(%rbp)
+	movq	-176(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -624(%rbp)
+	movq	%rsi, -496(%rbp)
+	movq	-184(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -368(%rbp)
+	movq	-192(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -616(%rbp)
+	movq	-56(%rbp), %rax                 # 8-byte Reload
+	movq	%rax, -488(%rbp)
+	movq	-200(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -360(%rbp)
+	movq	-208(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -608(%rbp)
+	movq	-64(%rbp), %rax                 # 8-byte Reload
+	movq	%rax, -480(%rbp)
+	movq	-216(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -352(%rbp)
+	movq	-224(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -600(%rbp)
+	movq	-256(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -472(%rbp)
+	movq	-232(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -344(%rbp)
+	movq	%rcx, -592(%rbp)
+	movq	-272(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -464(%rbp)
+	movq	%r8, -336(%rbp)
+	movq	$1, -584(%rbp)
+	vmovups	%xmm1, -456(%rbp)
+	movq	$0, -328(%rbp)
+	movq	$1, -568(%rbp)
+	movq	$0, -312(%rbp)
+	testq	%r10, %r10
+	je	.LBB103_35
+# %bb.28:                               # %land.lhs.true138.peel307
+	movq	%r10, %rax
+	cmpq	-296(%rbp), %r10                # 8-byte Folded Reload
+	jne	.LBB103_35
+# %bb.29:                               # %land.lhs.true138.peel307
+	cmpq	-280(%rbp), %rax                # 8-byte Folded Reload
+	jne	.LBB103_35
+# %bb.30:                               # %while.body.peel311
+	movq	-280(%rbp), %rcx                # 8-byte Reload
+	imulq	-288(%rbp), %rcx                # 8-byte Folded Reload
+	movq	%rcx, -304(%rbp)
+	movq	-264(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -688(%rbp)
+	movq	-72(%rbp), %rax                 # 8-byte Reload
+	movq	%rax, -560(%rbp)
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	%rax, -432(%rbp)
+	movq	-80(%rbp), %rax                 # 8-byte Reload
+	movq	%rax, -680(%rbp)
+	movq	-240(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -552(%rbp)
+	movq	-88(%rbp), %rax                 # 8-byte Reload
+	movq	%rax, -424(%rbp)
+	movq	-96(%rbp), %rax                 # 8-byte Reload
+	movq	%rax, -672(%rbp)
+	movq	-248(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -544(%rbp)
+	movq	-104(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -416(%rbp)
+	movq	-112(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -664(%rbp)
+	movq	%r15, -536(%rbp)
+	movq	-120(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -408(%rbp)
+	movq	-128(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -656(%rbp)
+	movq	%r13, -528(%rbp)
+	movq	-136(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -400(%rbp)
+	movq	-144(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -648(%rbp)
+	movq	%r11, -520(%rbp)
+	movq	-152(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -392(%rbp)
+	movq	-160(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -640(%rbp)
+	movq	%r9, -512(%rbp)
+	movq	-168(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -384(%rbp)
+	movq	-176(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -632(%rbp)
+	movq	%rsi, -504(%rbp)
+	movq	-184(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -376(%rbp)
+	movq	-192(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -624(%rbp)
+	movq	-56(%rbp), %rax                 # 8-byte Reload
+	movq	%rax, -496(%rbp)
+	movq	-200(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -368(%rbp)
+	movq	-208(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -616(%rbp)
+	movq	-64(%rbp), %rax                 # 8-byte Reload
+	movq	%rax, -488(%rbp)
+	movq	-216(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -360(%rbp)
+	movq	-224(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -608(%rbp)
+	movq	-256(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -480(%rbp)
+	movq	-232(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -352(%rbp)
+	movq	-736(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -600(%rbp)
+	movq	-272(%rbp), %rax                # 8-byte Reload
+	movq	%rax, -472(%rbp)
+	movq	%r8, -344(%rbp)
+	movq	$1, -592(%rbp)
+	vmovups	%xmm1, -464(%rbp)
+	movq	$0, -336(%rbp)
+	movq	$1, -576(%rbp)
+	movq	$0, -320(%rbp)
+	movq	$0, -440(%rbp)
+	movq	%rcx, %rdi
+	testq	%rcx, %rcx
+	movq	-736(%rbp), %rcx                # 8-byte Reload
+	je	.LBB103_35
+# %bb.31:
+	vxorps	%xmm1, %xmm1, %xmm1
+.LBB103_32:                             # %land.lhs.true138
+                                        # =>This Inner Loop Header: Depth=1
+	movq	-72(%rbp), %rdx                 # 8-byte Reload
+	movq	-80(%rbp), %rax                 # 8-byte Reload
+	movq	%rcx, %r12
+	movq	-88(%rbp), %rcx                 # 8-byte Reload
+	movq	-96(%rbp), %r14                 # 8-byte Reload
+	movq	%r14, -80(%rbp)                 # 8-byte Spill
+	movq	-104(%rbp), %r14                # 8-byte Reload
+	movq	%r14, -88(%rbp)                 # 8-byte Spill
+	movq	-112(%rbp), %r14                # 8-byte Reload
+	movq	%r14, -96(%rbp)                 # 8-byte Spill
+	movq	-120(%rbp), %r14                # 8-byte Reload
+	movq	%r14, -104(%rbp)                # 8-byte Spill
+	movq	-128(%rbp), %r14                # 8-byte Reload
+	movq	%r14, -112(%rbp)                # 8-byte Spill
+	movq	-136(%rbp), %r14                # 8-byte Reload
+	movq	%r14, -120(%rbp)                # 8-byte Spill
+	movq	-144(%rbp), %r14                # 8-byte Reload
+	movq	%r14, -128(%rbp)                # 8-byte Spill
+	movq	-152(%rbp), %r14                # 8-byte Reload
+	movq	%r14, -136(%rbp)                # 8-byte Spill
+	movq	-160(%rbp), %r14                # 8-byte Reload
+	movq	%r14, -144(%rbp)                # 8-byte Spill
+	movq	-168(%rbp), %r14                # 8-byte Reload
+	movq	%r14, -152(%rbp)                # 8-byte Spill
+	movq	-176(%rbp), %r14                # 8-byte Reload
+	movq	%r14, -160(%rbp)                # 8-byte Spill
+	movq	-184(%rbp), %r14                # 8-byte Reload
+	movq	%r14, -168(%rbp)                # 8-byte Spill
+	movq	-192(%rbp), %r14                # 8-byte Reload
+	movq	%r14, -176(%rbp)                # 8-byte Spill
+	movq	-200(%rbp), %r14                # 8-byte Reload
+	movq	%r14, -184(%rbp)                # 8-byte Spill
+	movq	-208(%rbp), %r14                # 8-byte Reload
+	movq	%r14, -192(%rbp)                # 8-byte Spill
+	movq	-216(%rbp), %r14                # 8-byte Reload
+	movq	%r14, -200(%rbp)                # 8-byte Spill
+	movq	-224(%rbp), %r14                # 8-byte Reload
+	movq	%r14, -208(%rbp)                # 8-byte Spill
+	movq	-232(%rbp), %r14                # 8-byte Reload
+	movq	%r14, -216(%rbp)                # 8-byte Spill
+	movq	%r12, -224(%rbp)                # 8-byte Spill
+	movq	%r8, -232(%rbp)                 # 8-byte Spill
+	cmpq	%rdx, %rdi
+	movq	-240(%rbp), %rdx                # 8-byte Reload
+	movq	%rdx, -72(%rbp)                 # 8-byte Spill
+	movq	-248(%rbp), %rdx                # 8-byte Reload
+	movq	%rdx, -240(%rbp)                # 8-byte Spill
+	movq	%r15, -248(%rbp)                # 8-byte Spill
+	movq	%r13, %r15
+	movq	%r11, %r13
+	movq	%r9, %r11
+	movq	%rsi, %r9
+	movq	-56(%rbp), %rsi                 # 8-byte Reload
+	movq	-64(%rbp), %rdx                 # 8-byte Reload
+	movq	%rdx, -56(%rbp)                 # 8-byte Spill
+	movq	-256(%rbp), %rdx                # 8-byte Reload
+	movq	%rdx, -64(%rbp)                 # 8-byte Spill
+	movq	-272(%rbp), %rdx                # 8-byte Reload
+	movq	%rdx, -256(%rbp)                # 8-byte Spill
+	jne	.LBB103_35
+# %bb.33:                               # %land.lhs.true138
+                                        #   in Loop: Header=BB103_32 Depth=1
+	cmpq	-48(%rbp), %rdi                 # 8-byte Folded Reload
+	jne	.LBB103_35
+# %bb.34:                               # %while.body
+                                        #   in Loop: Header=BB103_32 Depth=1
+	movq	-48(%rbp), %r14                 # 8-byte Reload
+	imulq	-264(%rbp), %r14                # 8-byte Folded Reload
+	movq	%r14, -304(%rbp)
+	movq	%rax, -688(%rbp)
+	movq	-72(%rbp), %rdx                 # 8-byte Reload
+	movq	%rdx, -560(%rbp)
+	movq	%rcx, -432(%rbp)
+	movq	-80(%rbp), %rdx                 # 8-byte Reload
+	movq	%rdx, -680(%rbp)
+	movq	-240(%rbp), %rdx                # 8-byte Reload
+	movq	%rdx, -552(%rbp)
+	movq	-88(%rbp), %rdx                 # 8-byte Reload
+	movq	%rdx, -424(%rbp)
+	movq	-96(%rbp), %rdx                 # 8-byte Reload
+	movq	%rdx, -672(%rbp)
+	movq	-248(%rbp), %rdx                # 8-byte Reload
+	movq	%rdx, -544(%rbp)
+	movq	-104(%rbp), %rdx                # 8-byte Reload
+	movq	%rdx, -416(%rbp)
+	movq	-112(%rbp), %rdx                # 8-byte Reload
+	movq	%rdx, -664(%rbp)
+	movq	%r15, -536(%rbp)
+	movq	-120(%rbp), %rdx                # 8-byte Reload
+	movq	%rdx, -408(%rbp)
+	movq	-128(%rbp), %rdx                # 8-byte Reload
+	movq	%rdx, -656(%rbp)
+	movq	%r13, -528(%rbp)
+	movq	-136(%rbp), %rdx                # 8-byte Reload
+	movq	%rdx, -400(%rbp)
+	movq	-144(%rbp), %rdx                # 8-byte Reload
+	movq	%rdx, -648(%rbp)
+	movq	%r11, -520(%rbp)
+	movq	-152(%rbp), %rdx                # 8-byte Reload
+	movq	%rdx, -392(%rbp)
+	movq	-160(%rbp), %rdx                # 8-byte Reload
+	movq	%rdx, -640(%rbp)
+	movq	%r9, -512(%rbp)
+	movq	-168(%rbp), %rdx                # 8-byte Reload
+	movq	%rdx, -384(%rbp)
+	movq	-176(%rbp), %rdx                # 8-byte Reload
+	movq	%rdx, -632(%rbp)
+	movq	%rsi, -504(%rbp)
+	movq	-184(%rbp), %rdx                # 8-byte Reload
+	movq	%rdx, -376(%rbp)
+	movq	-192(%rbp), %rdx                # 8-byte Reload
+	movq	%rdx, -624(%rbp)
+	movq	-56(%rbp), %rdx                 # 8-byte Reload
+	movq	%rdx, -496(%rbp)
+	movq	-200(%rbp), %rdx                # 8-byte Reload
+	movq	%rdx, -368(%rbp)
+	movq	-208(%rbp), %rdx                # 8-byte Reload
+	movq	%rdx, -616(%rbp)
+	movq	-64(%rbp), %rdx                 # 8-byte Reload
+	movq	%rdx, -488(%rbp)
+	movq	-216(%rbp), %rdx                # 8-byte Reload
+	movq	%rdx, -360(%rbp)
+	movq	-224(%rbp), %rdx                # 8-byte Reload
+	movq	%rdx, -608(%rbp)
+	movq	-256(%rbp), %rdx                # 8-byte Reload
+	movq	%rdx, -480(%rbp)
+	movq	-232(%rbp), %rdx                # 8-byte Reload
+	movq	%rdx, -352(%rbp)
+	vmovups	%ymm0, -600(%rbp)
+	vmovups	%ymm1, -472(%rbp)
+	vmovups	%ymm1, -344(%rbp)
+	movq	$1, -568(%rbp)
+	movq	$0, -440(%rbp)
+	movq	$0, -312(%rbp)
+	movl	$0, %r8d
+	movl	$0, %edx
+	movq	%rdx, -272(%rbp)                # 8-byte Spill
+	movq	%rax, -264(%rbp)                # 8-byte Spill
+	movq	%rcx, -48(%rbp)                 # 8-byte Spill
+	movl	$1, %ecx
+	movq	%r14, %rdi
+	testq	%r14, %r14
+	jne	.LBB103_32
+.LBB103_35:                             # %while.end
+	leaq	-712(%rbp), %rsi
+	movl	$416, %edx                      # imm = 0x1A0
+	movq	%rbx, %rdi
+	vzeroupper
+	callq	memcpy@PLT
+	jmp	.LBB103_36
+.Lfunc_end103:
+	.size	_ZN6Halide7Runtime8Internal16make_buffer_copyEPK15halide_buffer_tbS4_b, .Lfunc_end103-_ZN6Halide7Runtime8Internal16make_buffer_copyEPK15halide_buffer_tbS4_b
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal10keys_equalEPKhS3_m,"ax",@progbits  # keys_equal(const unsigned char*, const unsigned char*, unsigned long) per the mangled name: returns al = 1 when memcmp over the forwarded arguments returns 0, otherwise al = 0
+	.weak	_ZN6Halide7Runtime8Internal10keys_equalEPKhS3_m # -- Begin function _ZN6Halide7Runtime8Internal10keys_equalEPKhS3_m
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal10keys_equalEPKhS3_m,@function
+_ZN6Halide7Runtime8Internal10keys_equalEPKhS3_m: # @_ZN6Halide7Runtime8Internal10keys_equalEPKhS3_m
+# %bb.0:                                # %entry
+	pushq	%rbp                            # standard frame setup
+	movq	%rsp, %rbp
+	callq	memcmp@PLT                      # rdi, rsi and rdx are not written before this call, so the three incoming arguments are passed to memcmp unchanged
+	testl	%eax, %eax                      # is the memcmp result zero?
+	sete	%al                             # al = 1 if memcmp returned 0, else 0
+	popq	%rbp
+	retq
+.Lfunc_end104:
+	.size	_ZN6Halide7Runtime8Internal10keys_equalEPKhS3_m, .Lfunc_end104-_ZN6Halide7Runtime8Internal10keys_equalEPKhS3_m
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal16buffer_has_shapeEPK15halide_buffer_tPK18halide_dimension_t,"ax",@progbits  # buffer_has_shape(const halide_buffer_t*, const halide_dimension_t*) per the mangled name: returns al = 1 when each 16-byte record reached through the pointer at 40(%rdi) matches the record at the same index under %rsi in all four 32-bit fields, or when the count at 36(%rdi) is not positive; otherwise al = 0
+	.weak	_ZN6Halide7Runtime8Internal16buffer_has_shapeEPK15halide_buffer_tPK18halide_dimension_t # -- Begin function _ZN6Halide7Runtime8Internal16buffer_has_shapeEPK15halide_buffer_tPK18halide_dimension_t
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal16buffer_has_shapeEPK15halide_buffer_tPK18halide_dimension_t,@function
+_ZN6Halide7Runtime8Internal16buffer_has_shapeEPK15halide_buffer_tPK18halide_dimension_t: # @_ZN6Halide7Runtime8Internal16buffer_has_shapeEPK15halide_buffer_tPK18halide_dimension_t
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movl	36(%rdi), %ecx                  # ecx = 32-bit record count read from offset 36 of the first argument (the 32-bit move zeroes the upper half of rcx)
+	testl	%ecx, %ecx
+	jle	.LBB105_10                      # count is zero or negative: nothing to compare, return true
+# %bb.1:                                # %for.body.lr.ph
+	movq	40(%rdi), %rdx                  # rdx = pointer stored at offset 40 of the first argument: base of its 16-byte records
+	movl	(%rdx), %eax                    # record 0, field at offset 0
+	cmpl	(%rsi), %eax
+	jne	.LBB105_12                      # mismatch: return false
+# %bb.2:                                # %land.lhs.true.i.i.preheader
+	movl	4(%rdx), %eax                   # record 0, field at offset 4
+	cmpl	4(%rsi), %eax
+	jne	.LBB105_12                      # mismatch: return false
+# %bb.3:                                # %land.lhs.true5.i.i.preheader
+	movl	8(%rdx), %eax                   # record 0, field at offset 8
+	cmpl	8(%rsi), %eax
+	jne	.LBB105_12                      # mismatch: return false
+# %bb.4:                                # %_ZNK18halide_dimension_tneERKS_.exit.preheader
+	movl	$24, %r8d                       # r8 = running byte offset; the loop addresses fields at -12, -8, -4 and 0 relative to it
+	movl	$1, %edi                        # rdi is reused as the index of the next record (the first-argument pointer is no longer needed)
+	xorl	%eax, %eax                      # al = 0, the result if the very next comparison fails
+	.p2align	4, 0x90
+.LBB105_5:                              # %_ZNK18halide_dimension_tneERKS_.exit
+                                        # =>This Inner Loop Header: Depth=1
+	movl	-12(%rdx,%r8), %r9d             # last field (offset 12) of the record whose first three fields already matched
+	cmpl	-12(%rsi,%r8), %r9d
+	jne	.LBB105_11                      # mismatch: al is 0 here, return false
+# %bb.6:                                # %for.cond
+                                        #   in Loop: Header=BB105_5 Depth=1
+	cmpq	%rcx, %rdi                      # has the index reached the record count?
+	setae	%al                             # al = 1 once every record has been checked, 0 while records remain
+	je	.LBB105_11                      # all records matched: return al = 1
+# %bb.7:                                # %for.body
+                                        #   in Loop: Header=BB105_5 Depth=1
+	movl	-8(%rdx,%r8), %r9d              # next record, field at offset 0
+	cmpl	-8(%rsi,%r8), %r9d
+	jne	.LBB105_11                      # mismatch: al is 0 from setae, return false
+# %bb.8:                                # %land.lhs.true.i.i
+                                        #   in Loop: Header=BB105_5 Depth=1
+	movl	-4(%rdx,%r8), %r9d              # next record, field at offset 4
+	cmpl	-4(%rsi,%r8), %r9d
+	jne	.LBB105_11                      # mismatch: return false
+# %bb.9:                                # %land.lhs.true5.i.i
+                                        #   in Loop: Header=BB105_5 Depth=1
+	movl	(%rdx,%r8), %r9d                # next record, field at offset 8
+	leaq	16(%r8), %r10                   # offset for the following iteration (records are 16 bytes apart)
+	incq	%rdi                            # advance the record index
+	cmpl	(%rsi,%r8), %r9d
+	movq	%r10, %r8                       # commit the new offset; mov leaves the flags from cmpl intact
+	je	.LBB105_5                       # equal: loop back to check this record's last field
+	jmp	.LBB105_11                      # mismatch: al is still 0, return false
+.LBB105_12:
+	xorl	%eax, %eax                      # al = 0 (false): mismatch found in the first record
+	andb	$1, %al                         # keep only bit 0 of the boolean result
+                                        # kill: def $al killed $al killed $eax
+	popq	%rbp
+	retq
+.LBB105_10:
+	movb	$1, %al                         # al = 1 (true): no records to compare
+.LBB105_11:                             # %cleanup
+	andb	$1, %al                         # keep only bit 0 of the boolean result
+                                        # kill: def $al killed $al killed $eax
+	popq	%rbp
+	retq
+.Lfunc_end105:
+	.size	_ZN6Halide7Runtime8Internal16buffer_has_shapeEPK15halide_buffer_tPK18halide_dimension_t, .Lfunc_end105-_ZN6Halide7Runtime8Internal16buffer_has_shapeEPK15halide_buffer_tPK18halide_dimension_t
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal10CacheEntry4initEPKhmjPK15halide_buffer_tiPPS5_by,"ax",@progbits  # CacheEntry::init per the mangled name; this in rdi, then (const unsigned char*, unsigned long, unsigned int, const halide_buffer_t*, int, halide_buffer_t**, bool, unsigned long long). Fills the entry at rdi, makes one halide_malloc allocation, and copies into it the key bytes, the 16-byte records of the buffer in r8, and the 56-byte buffers (plus their records) listed in the pointer array at 16(%rbp). Returns al = 1 if the allocation succeeded, else 0
+	.weak	_ZN6Halide7Runtime8Internal10CacheEntry4initEPKhmjPK15halide_buffer_tiPPS5_by # -- Begin function _ZN6Halide7Runtime8Internal10CacheEntry4initEPKhmjPK15halide_buffer_tiPPS5_by
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal10CacheEntry4initEPKhmjPK15halide_buffer_tiPPS5_by,@function
+_ZN6Halide7Runtime8Internal10CacheEntry4initEPKhmjPK15halide_buffer_tiPPS5_by: # @_ZN6Halide7Runtime8Internal10CacheEntry4initEPKhmjPK15halide_buffer_tiPPS5_by
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15                            # save the callee-saved registers used below
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	pushq	%rax                            # reserves 8 more bytes of stack; released by addq $8, %rsp in the epilogue (value is never read back)
+	movl	%r9d, %r12d                     # r12d = fifth argument (number of entries in the pointer array)
+	movq	%r8, %r14                       # r14 = fourth argument (buffer pointer), kept across the call
+	movq	%rsi, %r15                      # r15 = first argument (source key bytes)
+	movq	%rdi, %rbx                      # rbx = this
+	vxorps	%xmm0, %xmm0, %xmm0
+	vmovups	%xmm0, (%rdi)                   # zero bytes 0..15 of the entry
+	movq	$0, 16(%rdi)                    # zero bytes 16..23 of the entry
+	movq	%rdx, 32(%rdi)                  # offset 32 = second argument (key byte count)
+	movl	%ecx, 48(%rdi)                  # offset 48 = third argument
+	movl	$0, 52(%rdi)                    # offset 52 = 0
+	movl	%r9d, 56(%rdi)                  # offset 56 = fifth argument (pointer-array length)
+	movslq	36(%r8), %rax                   # rax = sign-extended 32-bit value at offset 36 of the buffer argument
+	movl	%eax, 60(%rdi)                  # offset 60 = that value; used below as the number of 16-byte records per buffer
+	movl	%r9d, %ecx                      # zero-extend the fifth argument into rcx
+	imulq	$56, %rcx, %r13                 # r13 = array length * 56 (56 bytes are copied per listed buffer below)
+	incl	%r12d                           # r12 = array length + 1 (one record block per listed buffer plus one for the r8 buffer)
+	imulq	%rax, %r12                      # ... times records per buffer
+	shlq	$4, %r12                        # ... times 16 bytes per record
+	addq	%r13, %r12                      # r12 = byte offset of the key within the allocation
+	leaq	(%rdx,%r12), %rsi               # allocation size = that offset + key byte count
+	xorl	%edi, %edi                      # first argument to halide_malloc is 0
+	callq	halide_malloc@PLT
+	movq	%rax, 24(%rbx)                  # offset 24 = allocation pointer (may be null)
+	testq	%rax, %rax
+	je	.LBB106_12                      # allocation failed: skip every copy and return false
+# %bb.1:                                # %if.end
+	movq	%rax, 72(%rbx)                  # offset 72 = start of allocation, where the 56-byte buffer copies go
+	addq	%rax, %r13
+	movq	%r13, 64(%rbx)                  # offset 64 = allocation + array length * 56, where the 16-byte records go
+	addq	%rax, %r12
+	movq	%r12, 40(%rbx)                  # offset 40 = allocation + r12, where the key bytes go
+	cmpq	$0, 32(%rbx)
+	je	.LBB106_4                       # zero-length key: nothing to copy
+# %bb.2:                                # %for.body.preheader
+	xorl	%ecx, %ecx                      # rcx = byte index
+	.p2align	4, 0x90
+.LBB106_3:                              # %for.body
+                                        # =>This Inner Loop Header: Depth=1
+	movzbl	(%r15,%rcx), %edx               # load one key byte from the first argument
+	movq	40(%rbx), %rsi                  # destination pointer is re-read from the entry on every iteration
+	movb	%dl, (%rsi,%rcx)                # store it into the allocation
+	incq	%rcx
+	cmpq	32(%rbx), %rcx                  # loop while index is below the stored key byte count (unsigned)
+	jb	.LBB106_3
+.LBB106_4:                              # %for.cond23.preheader
+	cmpl	$0, 60(%rbx)
+	jle	.LBB106_7                       # no records to copy
+# %bb.5:                                # %for.body27.lr.ph
+	xorl	%ecx, %ecx                      # rcx = byte offset of the current record
+	xorl	%edx, %edx                      # rdx = record index
+	.p2align	4, 0x90
+.LBB106_6:                              # %for.body27
+                                        # =>This Inner Loop Header: Depth=1
+	movq	40(%r14), %rsi                  # source: record pointer at offset 40 of the fourth argument
+	movq	64(%rbx), %rdi                  # destination: first record block in the allocation
+	vmovups	(%rsi,%rcx), %xmm0              # copy one 16-byte record
+	vmovups	%xmm0, (%rdi,%rcx)
+	incq	%rdx
+	movslq	60(%rbx), %rsi                  # reload the record count from the entry
+	addq	$16, %rcx
+	cmpq	%rsi, %rdx                      # signed compare of index against count
+	jl	.LBB106_6
+.LBB106_7:                              # %for.cond36.preheader
+	movq	32(%rbp), %rcx                  # rcx = eighth argument, passed on the stack
+	movzbl	24(%rbp), %edx                  # dl = seventh argument (bool), passed on the stack
+	cmpl	$0, 56(%rbx)
+	je	.LBB106_11                      # empty pointer array: skip the buffer copies
+# %bb.8:                                # %for.body40.preheader
+	movq	16(%rbp), %rsi                  # rsi = sixth argument: array of buffer pointers, passed on the stack
+	xorl	%edi, %edi                      # rdi = index i into that array
+	jmp	.LBB106_9
+	.p2align	4, 0x90
+.LBB106_10:                             # %for.cond36.loopexit
+                                        #   in Loop: Header=BB106_9 Depth=1
+	movl	56(%rbx), %r8d                  # reload the array length from the entry
+	cmpq	%r8, %rdi                       # rdi already holds i + 1 here
+	jae	.LBB106_11                      # all listed buffers copied
+.LBB106_9:                              # %for.body40
+                                        # =>This Loop Header: Depth=1
+                                        #     Child Loop BB106_14 Depth 2
+	movq	%rdi, %r8                       # r8 = i
+	movq	(%rsi,%rdi,8), %rdi             # rdi = i-th buffer pointer from the array
+	movq	72(%rbx), %r10                  # r10 = base of the 56-byte buffer copies
+	imulq	$56, %r8, %r9                   # r9 = i * 56, byte offset of copy i
+	vmovups	(%rdi), %ymm0                   # source bytes 0..31
+	vmovups	24(%rdi), %ymm1                 # source bytes 24..55 (overlaps the first load so two 32-byte moves cover 56 bytes)
+	vmovups	%ymm1, 24(%r10,%r9)
+	vmovups	%ymm0, (%r10,%r9)
+	leaq	1(%r8), %rdi                    # rdi = i + 1, the next index
+	movl	60(%rbx), %r10d                 # r10d = records per buffer
+	movl	%edi, %r11d
+	imull	%r10d, %r11d                    # (i + 1) * records per buffer, as a 32-bit product
+	shlq	$4, %r11                        # ... times 16 bytes per record
+	addq	64(%rbx), %r11                  # r11 = address of the record block belonging to copy i
+	movq	72(%rbx), %r14                  # r14 is reused here; the fourth argument is no longer needed
+	movq	%r11, 40(%r14,%r9)              # point offset 40 of copy i at its own record block inside the allocation
+	testl	%r10d, %r10d
+	jle	.LBB106_10                      # no records to copy for this buffer
+# %bb.13:                               # %for.body59.preheader
+                                        #   in Loop: Header=BB106_9 Depth=1
+	xorl	%r10d, %r10d                    # r10 = byte offset of the current record
+	xorl	%r11d, %r11d                    # r11 = record index
+	.p2align	4, 0x90
+.LBB106_14:                             # %for.body59
+                                        #   Parent Loop BB106_9 Depth=1
+                                        # =>  This Inner Loop Header: Depth=2
+	movq	(%rsi,%r8,8), %r14              # reload the i-th source buffer pointer
+	movq	40(%r14), %r14                  # source: its record pointer at offset 40
+	movq	72(%rbx), %r15                  # r15 is reused here; the key pointer is no longer needed
+	movq	40(%r15,%r9), %r15              # destination: record pointer stored in copy i above
+	vmovups	(%r14,%r10), %xmm0              # copy one 16-byte record
+	vmovups	%xmm0, (%r15,%r10)
+	incq	%r11
+	movslq	60(%rbx), %r14                  # reload the record count from the entry
+	addq	$16, %r10
+	cmpq	%r14, %r11                      # signed compare of index against count
+	jl	.LBB106_14
+	jmp	.LBB106_10
+.LBB106_11:                             # %for.cond.cleanup39
+	movb	%dl, 88(%rbx)                   # offset 88 = seventh argument (bool)
+	movq	%rcx, 80(%rbx)                  # offset 80 = eighth argument
+.LBB106_12:                             # %cleanup
+	testq	%rax, %rax                      # rax has not been written since the halide_malloc call, so it still holds the allocation pointer
+	setne	%al                             # al = 1 if the allocation succeeded, 0 if it was null
+	addq	$8, %rsp                        # release the 8 bytes reserved by pushq %rax
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	vzeroupper                              # zero the upper halves of the ymm registers after the 256-bit moves above
+	retq
+.Lfunc_end106:
+	.size	_ZN6Halide7Runtime8Internal10CacheEntry4initEPKhmjPK15halide_buffer_tiPPS5_by, .Lfunc_end106-_ZN6Halide7Runtime8Internal10CacheEntry4initEPKhmjPK15halide_buffer_tiPPS5_by
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal8djb_hashEPKhm,"ax",@progbits  # djb_hash(const unsigned char*, unsigned long) per the mangled name: starts from 5381 and folds each of the rsi bytes at rdi as h = h * 33 + byte in 32-bit arithmetic; result in eax
+	.weak	_ZN6Halide7Runtime8Internal8djb_hashEPKhm # -- Begin function _ZN6Halide7Runtime8Internal8djb_hashEPKhm
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal8djb_hashEPKhm,@function
+_ZN6Halide7Runtime8Internal8djb_hashEPKhm: # @_ZN6Halide7Runtime8Internal8djb_hashEPKhm
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movl	$5381, %eax                     # imm = 0x1505; initial hash value
+	testq	%rsi, %rsi
+	je	.LBB107_3                       # zero length: return the initial value 5381
+# %bb.1:                                # %for.body.preheader
+	xorl	%ecx, %ecx                      # rcx = byte index
+	.p2align	4, 0x90
+.LBB107_2:                              # %for.body
+                                        # =>This Inner Loop Header: Depth=1
+	movl	%eax, %edx
+	shll	$5, %edx                        # h * 32
+	addl	%eax, %edx                      # h * 32 + h = h * 33 (wraps at 32 bits)
+	movzbl	(%rdi,%rcx), %eax               # next input byte, zero-extended
+	addl	%edx, %eax                      # h = h * 33 + byte
+	incq	%rcx
+	cmpq	%rcx, %rsi                      # loop until the index equals the length
+	jne	.LBB107_2
+.LBB107_3:                              # %for.cond.cleanup
+	popq	%rbp
+	retq
+.Lfunc_end107:
+	.size	_ZN6Halide7Runtime8Internal8djb_hashEPKhm, .Lfunc_end107-_ZN6Halide7Runtime8Internal8djb_hashEPKhm
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal11prune_cacheEv,"ax",@progbits
+	.weak	_ZN6Halide7Runtime8Internal11prune_cacheEv # -- Begin function _ZN6Halide7Runtime8Internal11prune_cacheEv
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal11prune_cacheEv,@function
+_ZN6Halide7Runtime8Internal11prune_cacheEv: # @_ZN6Halide7Runtime8Internal11prune_cacheEv
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	pushq	%rax
+	movq	_ZN6Halide7Runtime8Internal18current_cache_sizeE@GOTPCREL(%rip), %rax
+	movq	(%rax), %rax
+	movq	_ZN6Halide7Runtime8Internal14max_cache_sizeE@GOTPCREL(%rip), %rcx
+	movq	(%rcx), %rcx
+	cmpq	%rcx, %rax
+	jle	.LBB108_24
+# %bb.1:                                # %entry
+	movq	_ZN6Halide7Runtime8Internal19least_recently_usedE@GOTPCREL(%rip), %rdx
+	movq	(%rdx), %rbx
+	testq	%rbx, %rbx
+	je	.LBB108_24
+# %bb.2:                                # %while.body.preheader
+	movq	_ZN6Halide7Runtime8Internal13cache_entriesE@GOTPCREL(%rip), %rsi
+	.p2align	4, 0x90
+.LBB108_3:                              # %while.body
+                                        # =>This Loop Header: Depth=1
+                                        #     Child Loop BB108_6 Depth 2
+                                        #     Child Loop BB108_18 Depth 2
+                                        #       Child Loop BB108_26 Depth 3
+                                        #       Child Loop BB108_30 Depth 3
+	movq	8(%rbx), %r13
+	cmpl	$0, 52(%rbx)
+	jne	.LBB108_22
+# %bb.4:                                # %if.then
+                                        #   in Loop: Header=BB108_3 Depth=1
+	movzbl	48(%rbx), %ecx
+	movq	(%rsi,%rcx,8), %rdx
+	cmpq	%rbx, %rdx
+	je	.LBB108_5
+	.p2align	4, 0x90
+.LBB108_6:                              # %while.cond9
+                                        #   Parent Loop BB108_3 Depth=1
+                                        # =>  This Inner Loop Header: Depth=2
+	testq	%rdx, %rdx
+	je	.LBB108_35
+# %bb.7:                                # %land.rhs11
+                                        #   in Loop: Header=BB108_6 Depth=2
+	movq	%rdx, %rcx
+	movq	(%rdx), %rdx
+	cmpq	%rbx, %rdx
+	jne	.LBB108_6
+	jmp	.LBB108_8
+.LBB108_5:                              #   in Loop: Header=BB108_3 Depth=1
+	leaq	(%rsi,%rcx,8), %rcx
+.LBB108_8:                              # %if.end21
+                                        #   in Loop: Header=BB108_3 Depth=1
+	movq	(%rbx), %rdx
+	movq	%rdx, (%rcx)
+	movq	_ZN6Halide7Runtime8Internal19least_recently_usedE@GOTPCREL(%rip), %rcx
+	cmpq	%rbx, (%rcx)
+	je	.LBB108_9
+# %bb.10:                               # %if.end24
+                                        #   in Loop: Header=BB108_3 Depth=1
+	testq	%r13, %r13
+	je	.LBB108_12
+.LBB108_11:                             # %if.then26
+                                        #   in Loop: Header=BB108_3 Depth=1
+	movq	16(%rbx), %rcx
+	movq	%rcx, 16(%r13)
+.LBB108_12:                             # %if.end28
+                                        #   in Loop: Header=BB108_3 Depth=1
+	movq	16(%rbx), %rcx
+	movq	_ZN6Halide7Runtime8Internal18most_recently_usedE@GOTPCREL(%rip), %rdx
+	cmpq	%rbx, (%rdx)
+	je	.LBB108_13
+# %bb.14:                               # %if.end32
+                                        #   in Loop: Header=BB108_3 Depth=1
+	testq	%rcx, %rcx
+	je	.LBB108_16
+.LBB108_15:                             # %if.then35
+                                        #   in Loop: Header=BB108_3 Depth=1
+	movq	%r13, 16(%rbx)
+.LBB108_16:                             # %if.end37
+                                        #   in Loop: Header=BB108_3 Depth=1
+	movl	56(%rbx), %ecx
+	testq	%rcx, %rcx
+	je	.LBB108_21
+# %bb.17:                               # %for.body.lr.ph
+                                        #   in Loop: Header=BB108_3 Depth=1
+	movq	72(%rbx), %rdx
+	xorl	%esi, %esi
+	jmp	.LBB108_18
+	.p2align	4, 0x90
+.LBB108_33:                             # %_ZNK15halide_buffer_t12begin_offsetEv.exit.loopexit.i
+                                        #   in Loop: Header=BB108_18 Depth=2
+	notq	%r10
+	addq	%r10, %r11
+.LBB108_34:                             # %_ZNK15halide_buffer_t13size_in_bytesEv.exit
+                                        #   in Loop: Header=BB108_18 Depth=2
+	movzbl	33(%rdx,%rdi), %edi
+	addq	$7, %rdi
+	shrq	$3, %rdi
+	imulq	%r11, %rdi
+	addq	%rdi, %rax
+	incq	%rsi
+	cmpq	%rcx, %rsi
+	je	.LBB108_20
+.LBB108_18:                             # %for.body
+                                        #   Parent Loop BB108_3 Depth=1
+                                        # =>  This Loop Header: Depth=2
+                                        #       Child Loop BB108_26 Depth 3
+                                        #       Child Loop BB108_30 Depth 3
+	imulq	$56, %rsi, %rdi
+	movl	36(%rdx,%rdi), %r8d
+	testl	%r8d, %r8d
+	jle	.LBB108_19
+# %bb.25:                               # %for.body.lr.ph.i.i
+                                        #   in Loop: Header=BB108_18 Depth=2
+	movq	40(%rdx,%rdi), %r9
+	shlq	$4, %r8
+	xorl	%r11d, %r11d
+	xorl	%r10d, %r10d
+	jmp	.LBB108_26
+	.p2align	4, 0x90
+.LBB108_28:                             # %if.end.i.i
+                                        #   in Loop: Header=BB108_26 Depth=3
+	addq	$16, %r11
+	cmpq	%r11, %r8
+	je	.LBB108_29
+.LBB108_26:                             # %for.body.i.i
+                                        #   Parent Loop BB108_3 Depth=1
+                                        #     Parent Loop BB108_18 Depth=2
+                                        # =>    This Inner Loop Header: Depth=3
+	movl	8(%r9,%r11), %r15d
+	testl	%r15d, %r15d
+	jle	.LBB108_28
+# %bb.27:                               # %if.then.i.i
+                                        #   in Loop: Header=BB108_26 Depth=3
+	movslq	4(%r9,%r11), %r12
+	decq	%r12
+	imulq	%r15, %r12
+	addq	%r12, %r10
+	jmp	.LBB108_28
+	.p2align	4, 0x90
+.LBB108_29:                             # %for.body.i12.i.preheader
+                                        #   in Loop: Header=BB108_18 Depth=2
+	xorl	%r15d, %r15d
+	xorl	%r11d, %r11d
+	jmp	.LBB108_30
+	.p2align	4, 0x90
+.LBB108_32:                             # %if.end.i23.i
+                                        #   in Loop: Header=BB108_30 Depth=3
+	addq	$16, %r15
+	cmpq	%r15, %r8
+	je	.LBB108_33
+.LBB108_30:                             # %for.body.i12.i
+                                        #   Parent Loop BB108_3 Depth=1
+                                        #     Parent Loop BB108_18 Depth=2
+                                        # =>    This Inner Loop Header: Depth=3
+	movslq	8(%r9,%r15), %r12
+	testq	%r12, %r12
+	jns	.LBB108_32
+# %bb.31:                               # %if.then.i19.i
+                                        #   in Loop: Header=BB108_30 Depth=3
+	movslq	4(%r9,%r15), %r14
+	decq	%r14
+	imulq	%r12, %r14
+	addq	%r14, %r11
+	jmp	.LBB108_32
+	.p2align	4, 0x90
+.LBB108_19:                             #   in Loop: Header=BB108_18 Depth=2
+	movq	$-1, %r11
+	jmp	.LBB108_34
+	.p2align	4, 0x90
+.LBB108_20:                             # %for.cond.for.cond.cleanup_crit_edge
+                                        #   in Loop: Header=BB108_3 Depth=1
+	movq	_ZN6Halide7Runtime8Internal18current_cache_sizeE@GOTPCREL(%rip), %rcx
+	movq	%rax, (%rcx)
+.LBB108_21:                             # %for.cond.cleanup
+                                        #   in Loop: Header=BB108_3 Depth=1
+	movq	%rbx, %rdi
+	callq	_ZN6Halide7Runtime8Internal10CacheEntry7destroyEv@PLT
+	xorl	%edi, %edi
+	movq	%rbx, %rsi
+	callq	halide_free@PLT
+	movq	_ZN6Halide7Runtime8Internal18current_cache_sizeE@GOTPCREL(%rip), %rax
+	movq	(%rax), %rax
+	movq	_ZN6Halide7Runtime8Internal14max_cache_sizeE@GOTPCREL(%rip), %rcx
+	movq	(%rcx), %rcx
+	movq	_ZN6Halide7Runtime8Internal13cache_entriesE@GOTPCREL(%rip), %rsi
+.LBB108_22:                             # %if.end41
+                                        #   in Loop: Header=BB108_3 Depth=1
+	cmpq	%rcx, %rax
+	jle	.LBB108_24
+# %bb.23:                               # %if.end41
+                                        #   in Loop: Header=BB108_3 Depth=1
+	movq	%r13, %rbx
+	testq	%r13, %r13
+	jne	.LBB108_3
+	jmp	.LBB108_24
+.LBB108_9:                              # %if.then23
+                                        #   in Loop: Header=BB108_3 Depth=1
+	movq	_ZN6Halide7Runtime8Internal19least_recently_usedE@GOTPCREL(%rip), %rcx
+	movq	%r13, (%rcx)
+	testq	%r13, %r13
+	jne	.LBB108_11
+	jmp	.LBB108_12
+.LBB108_13:                             # %if.then30
+                                        #   in Loop: Header=BB108_3 Depth=1
+	movq	_ZN6Halide7Runtime8Internal18most_recently_usedE@GOTPCREL(%rip), %rdx
+	movq	%rcx, (%rdx)
+	testq	%rcx, %rcx
+	jne	.LBB108_15
+	jmp	.LBB108_16
+.LBB108_24:                             # %while.end42
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.LBB108_35:                             # %if.then18
+	leaq	.L.str.2.42(%rip), %rsi
+	xorl	%edi, %edi
+	callq	halide_print@PLT
+	callq	abort@PLT
+.Lfunc_end108:
+	.size	_ZN6Halide7Runtime8Internal11prune_cacheEv, .Lfunc_end108-_ZN6Halide7Runtime8Internal11prune_cacheEv
+                                        # -- End function
+	.section	.text.halide_memoization_cache_set_size,"ax",@progbits
+	.weak	halide_memoization_cache_set_size # -- Begin function halide_memoization_cache_set_size
+	.p2align	4, 0x90
+	.type	halide_memoization_cache_set_size,@function
+halide_memoization_cache_set_size:      # @halide_memoization_cache_set_size -- stores a new max_cache_size (first argument, rdi; zero selects 0x100000) while holding memoization_lock, then calls prune_cache and unlocks
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r14                            # r14 and rbx are preserved for the caller; both are used below
+	pushq	%rbx
+	testq	%rdi, %rdi                      # is the requested size zero?
+	movl	$1048576, %r14d                 # imm = 0x100000
+	cmovneq	%rdi, %r14                      # a nonzero request replaces the 0x100000 default
+	movq	_ZN6Halide7Runtime8Internal16memoization_lockE@GOTPCREL(%rip), %rbx # rbx = address of memoization_lock, kept for the unlock
+	movq	%rbx, %rdi
+	callq	halide_mutex_lock@PLT           # take the lock before writing max_cache_size
+	movq	_ZN6Halide7Runtime8Internal14max_cache_sizeE@GOTPCREL(%rip), %rax
+	movq	%r14, (%rax)                    # max_cache_size = chosen size
+	callq	_ZN6Halide7Runtime8Internal11prune_cacheEv@PLT # prune_cache() is called with the lock still held
+	movq	%rbx, %rdi                      # argument for the unlock tail call
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	jmp	halide_mutex_unlock@PLT         # TAILCALL
+.Lfunc_end109:
+	.size	halide_memoization_cache_set_size, .Lfunc_end109-halide_memoization_cache_set_size
+                                        # -- End function
+	.section	.text.halide_memoization_cache_lookup,"ax",@progbits
+	.weak	halide_memoization_cache_lookup # -- Begin function halide_memoization_cache_lookup
+	.p2align	4, 0x90
+	.type	halide_memoization_cache_lookup,@function
+halide_memoization_cache_lookup:        # @halide_memoization_cache_lookup -- under memoization_lock, scans bucket cache_entries[low byte of key hash] for a matching entry; eax result: 0 = hit (entry made most_recently_used, 56-byte buffer records copied to the caller), 1 = miss (storage allocated for each caller buffer), -1 = halide_malloc failed
+# %bb.0:                                # %entry -- arguments as used below: rdi = context handed to halide_malloc/halide_print/halide_free, rsi = key bytes, edx = key length, rcx = buffer checked by buffer_has_shape, r8d = buffer count, r9 = array of buffer pointers (roles read off the code; NOTE(review): confirm names against HalideRuntime.h)
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	subq	$56, %rsp                       # room for the spill slots used below
+	movq	%r9, -64(%rbp)                  # 8-byte Spill
+	movl	%r8d, %r13d                     # r13d = buffer count (5th argument)
+	movq	%rcx, %r14                      # r14 = 4th argument, later passed to buffer_has_shape
+	movq	%rsi, %rax
+	movq	%rdi, -56(%rbp)                 # 8-byte Spill
+	movslq	%edx, %rsi                      # sign-extend the key length for djb_hash
+	movq	%rax, -80(%rbp)                 # 8-byte Spill
+	movq	%rax, %rdi                      # djb_hash(key pointer, key length)
+	movq	%rsi, -48(%rbp)                 # 8-byte Spill
+	callq	_ZN6Halide7Runtime8Internal8djb_hashEPKhm@PLT
+	movl	%eax, %r15d                     # r15d = 32-bit hash of the key
+	movzbl	%r15b, %ebx                     # bucket index = low 8 bits of the hash
+	movq	_ZN6Halide7Runtime8Internal16memoization_lockE@GOTPCREL(%rip), %rdi
+	callq	halide_mutex_lock@PLT           # lock is released at cleanup119 (.LBB110_55)
+	movq	_ZN6Halide7Runtime8Internal13cache_entriesE@GOTPCREL(%rip), %rax
+	movq	(%rax,%rbx,8), %r12             # r12 = first entry of bucket cache_entries[bucket index]
+	testq	%r12, %r12
+	je	.LBB110_13                      # empty bucket: take the miss path
+# %bb.1:                                # %while.body.lr.ph
+	testl	%r13d, %r13d
+	jle	.LBB110_31                      # count not positive: use the scan loop without the per-buffer check
+# %bb.2:                                # %while.body.us.preheader
+	movslq	%r13d, %rbx                     # rbx = count, upper bound for the inner loop at .LBB110_9
+	jmp	.LBB110_3
+	.p2align	4, 0x90
+.LBB110_30:                             # %if.end73
+                                        #   in Loop: Header=BB110_31 Depth=1
+	movq	(%r12), %r12                    # follow the chain pointer at entry+0
+	testq	%r12, %r12
+	je	.LBB110_13                      # chain exhausted: miss
+.LBB110_31:                             # %while.body
+                                        # =>This Inner Loop Header: Depth=1
+	cmpl	%r15d, 48(%r12)                 # 32-bit value at entry+48 must equal the computed hash
+	jne	.LBB110_30
+# %bb.32:                               # %land.lhs.true
+                                        #   in Loop: Header=BB110_31 Depth=1
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	cmpq	%rax, 32(%r12)                  # value at entry+32 must equal the key length
+	jne	.LBB110_30
+# %bb.33:                               # %land.lhs.true7
+                                        #   in Loop: Header=BB110_31 Depth=1
+	movq	40(%r12), %rdi                  # pointer at entry+40 is compared against the caller's key
+	movq	-80(%rbp), %rsi                 # 8-byte Reload
+	movq	-48(%rbp), %rdx                 # 8-byte Reload
+	callq	_ZN6Halide7Runtime8Internal10keys_equalEPKhS3_m@PLT
+	testb	%al, %al
+	je	.LBB110_30
+# %bb.34:                               # %land.lhs.true10
+                                        #   in Loop: Header=BB110_31 Depth=1
+	movq	64(%r12), %rsi                  # halide_dimension_t pointer stored at entry+64
+	movq	%r14, %rdi
+	callq	_ZN6Halide7Runtime8Internal16buffer_has_shapeEPK15halide_buffer_tPK18halide_dimension_t@PLT
+	testb	%al, %al
+	je	.LBB110_30
+# %bb.35:                               # %land.lhs.true13
+                                        #   in Loop: Header=BB110_31 Depth=1
+	cmpl	%r13d, 56(%r12)                 # 32-bit value at entry+56 must equal the caller's count
+	jne	.LBB110_30
+.LBB110_36:                             # %if.then23
+	movq	_ZN6Halide7Runtime8Internal18most_recently_usedE@GOTPCREL(%rip), %rbx
+	cmpq	(%rbx), %r12                    # is the matched entry already most_recently_used?
+	movq	-64(%rbp), %r15                 # 8-byte Reload
+	je	.LBB110_49                      # yes: skip the relinking
+# %bb.37:                               # %do.body
+	cmpq	$0, 8(%r12)                     # entry is not the head, so the link at entry+8 must be non-null
+	jne	.LBB110_39
+# %bb.38:                               # %if.then27
+	leaq	.L.str.3.43(%rip), %rsi
+	movq	-56(%rbp), %rdi                 # 8-byte Reload
+	callq	halide_print@PLT
+	callq	abort@PLT
+.LBB110_39:                             # %do.end
+	movq	16(%r12), %rax                  # rax = neighbour linked at entry+16
+	testq	%rax, %rax
+	je	.LBB110_41                      # none: entry should be least_recently_used
+# %bb.40:                               # %if.then29
+	movq	8(%r12), %rcx
+	movq	%rcx, 8(%rax)                   # neighbour's +8 link now skips over the entry
+	movq	8(%r12), %rax
+	jmp	.LBB110_44
+.LBB110_11:                             # %for.cond.cleanup.us
+                                        #   in Loop: Header=BB110_3 Depth=1
+	testb	%al, %al                        # al = result of the last buffer_has_shape call in the inner loop
+	movl	-68(%rbp), %r13d                # 4-byte Reload
+	movq	-88(%rbp), %r14                 # 8-byte Reload
+	jne	.LBB110_36                      # every buffer matched: hit
+	.p2align	4, 0x90
+.LBB110_12:                             # %if.end73.us
+                                        #   in Loop: Header=BB110_3 Depth=1
+	movq	(%r12), %r12                    # follow the chain pointer at entry+0
+	testq	%r12, %r12
+	je	.LBB110_13                      # chain exhausted: miss
+.LBB110_3:                              # %while.body.us
+                                        # =>This Loop Header: Depth=1
+                                        #     Child Loop BB110_9 Depth 2
+	cmpl	%r15d, 48(%r12)                 # same entry checks as the loop at .LBB110_31
+	jne	.LBB110_12
+# %bb.4:                                # %land.lhs.true.us
+                                        #   in Loop: Header=BB110_3 Depth=1
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	cmpq	%rax, 32(%r12)
+	jne	.LBB110_12
+# %bb.5:                                # %land.lhs.true7.us
+                                        #   in Loop: Header=BB110_3 Depth=1
+	movq	40(%r12), %rdi
+	movq	-80(%rbp), %rsi                 # 8-byte Reload
+	movq	-48(%rbp), %rdx                 # 8-byte Reload
+	callq	_ZN6Halide7Runtime8Internal10keys_equalEPKhS3_m@PLT
+	testb	%al, %al
+	je	.LBB110_12
+# %bb.6:                                # %land.lhs.true10.us
+                                        #   in Loop: Header=BB110_3 Depth=1
+	movq	64(%r12), %rsi
+	movq	%r14, %rdi
+	callq	_ZN6Halide7Runtime8Internal16buffer_has_shapeEPK15halide_buffer_tPK18halide_dimension_t@PLT
+	testb	%al, %al
+	je	.LBB110_12
+# %bb.7:                                # %land.lhs.true13.us
+                                        #   in Loop: Header=BB110_3 Depth=1
+	cmpl	%r13d, 56(%r12)
+	jne	.LBB110_12
+# %bb.8:                                # %for.cond.preheader.us
+                                        #   in Loop: Header=BB110_3 Depth=1
+	movq	%r14, -88(%rbp)                 # 8-byte Spill
+	movl	%r13d, -68(%rbp)                # 4-byte Spill
+	movl	$1, %r13d                       # r13 = 1-based index into the caller's buffer array
+	movl	$5, %r14d                       # r14 = qword index of byte offset 40 in the first 56-byte record
+	.p2align	4, 0x90
+.LBB110_9:                              # %for.body.us
+                                        #   Parent Loop BB110_3 Depth=1
+                                        # =>  This Inner Loop Header: Depth=2
+	movq	-64(%rbp), %rax                 # 8-byte Reload
+	movq	-8(%rax,%r13,8), %rdi           # caller buffer number r13-1
+	movq	72(%r12), %rax                  # base of the entry's 56-byte records (pointer at entry+72)
+	movq	(%rax,%r14,8), %rsi             # pointer stored at offset 40 of the current record
+	callq	_ZN6Halide7Runtime8Internal16buffer_has_shapeEPK15halide_buffer_tPK18halide_dimension_t@PLT
+	testb	%al, %al
+	je	.LBB110_11                      # shape mismatch: stop with al = 0
+# %bb.10:                               # %for.body.us
+                                        #   in Loop: Header=BB110_9 Depth=2
+	leaq	1(%r13), %rcx
+	addq	$7, %r14                        # advance 7 qwords = one 56-byte record
+	cmpq	%rbx, %r13                      # continue while the index just checked is below the count
+	movq	%rcx, %r13
+	jl	.LBB110_9
+	jmp	.LBB110_11
+.LBB110_13:                             # %for.cond75.preheader
+	movl	$1, %ebx                        # result 1: no matching entry
+	testl	%r13d, %r13d
+	jle	.LBB110_55                      # no buffers to allocate
+# %bb.14:                               # %for.body78.preheader
+	movl	%r13d, %eax
+	movq	%rax, -48(%rbp)                 # 8-byte Spill
+	movl	$1, %r12d                       # r12 = buffers allocated so far, plus one
+	movl	$4294967295, %r13d              # imm = 0xFFFFFFFF
+	xorl	%r14d, %r14d                    # r14 = index of the buffer being allocated
+	.p2align	4, 0x90
+.LBB110_15:                             # %for.body78
+                                        # =>This Loop Header: Depth=1
+                                        #     Child Loop BB110_17 Depth 2
+                                        #     Child Loop BB110_21 Depth 2
+	movq	-64(%rbp), %rax                 # 8-byte Reload
+	movq	(%rax,%r14,8), %rbx             # rbx = caller buffer number r14
+	movl	36(%rbx), %ecx                  # 32-bit count at buffer+36 of the 16-byte records walked below
+	movl	$1, %eax                        # element count is 1 when that count is not positive
+	testl	%ecx, %ecx
+	jle	.LBB110_25
+# %bb.16:                               # %for.body.lr.ph.i.i
+                                        #   in Loop: Header=BB110_15 Depth=1
+	movq	40(%rbx), %rdx                  # base of the 16-byte records (pointer at buffer+40)
+	shlq	$4, %rcx                        # total byte length of those records
+	xorl	%esi, %esi
+	xorl	%eax, %eax                      # rax accumulates the contribution of positive +8 fields
+	jmp	.LBB110_17
+	.p2align	4, 0x90
+.LBB110_19:                             # %if.end.i.i
+                                        #   in Loop: Header=BB110_17 Depth=2
+	addq	$16, %rsi
+	cmpq	%rsi, %rcx
+	je	.LBB110_20
+.LBB110_17:                             # %for.body.i.i
+                                        #   Parent Loop BB110_15 Depth=1
+                                        # =>  This Inner Loop Header: Depth=2
+	movl	8(%rdx,%rsi), %edi              # 32-bit field at record+8 (presumably the stride -- confirm)
+	testl	%edi, %edi
+	jle	.LBB110_19                      # only positive values contribute in this pass
+# %bb.18:                               # %if.then.i.i
+                                        #   in Loop: Header=BB110_17 Depth=2
+	movslq	4(%rdx,%rsi), %r8               # 32-bit field at record+4 (presumably the extent -- confirm)
+	decq	%r8
+	imulq	%rdi, %r8
+	addq	%r8, %rax                       # rax += (field4 - 1) * field8
+	jmp	.LBB110_19
+	.p2align	4, 0x90
+.LBB110_20:                             # %for.body.i12.i.preheader
+                                        #   in Loop: Header=BB110_15 Depth=1
+	xorl	%edi, %edi
+	xorl	%esi, %esi                      # rsi accumulates the contribution of negative +8 fields
+	jmp	.LBB110_21
+	.p2align	4, 0x90
+.LBB110_23:                             # %if.end.i23.i
+                                        #   in Loop: Header=BB110_21 Depth=2
+	addq	$16, %rdi
+	cmpq	%rdi, %rcx
+	je	.LBB110_24
+.LBB110_21:                             # %for.body.i12.i
+                                        #   Parent Loop BB110_15 Depth=1
+                                        # =>  This Inner Loop Header: Depth=2
+	movslq	8(%rdx,%rdi), %r8
+	testq	%r8, %r8
+	jns	.LBB110_23                      # only negative values contribute in this pass
+# %bb.22:                               # %if.then.i19.i
+                                        #   in Loop: Header=BB110_21 Depth=2
+	movslq	4(%rdx,%rdi), %r9
+	decq	%r9
+	imulq	%r8, %r9
+	addq	%r9, %rsi                       # rsi += (field4 - 1) * field8
+	jmp	.LBB110_23
+	.p2align	4, 0x90
+.LBB110_24:                             # %_ZNK15halide_buffer_t12begin_offsetEv.exit.loopexit.i
+                                        #   in Loop: Header=BB110_15 Depth=1
+	subq	%rsi, %rax
+	incq	%rax                            # element count = positive sum - negative sum + 1
+.LBB110_25:                             # %_ZNK15halide_buffer_t13size_in_bytesEv.exit
+                                        #   in Loop: Header=BB110_15 Depth=1
+	movzbl	33(%rbx), %esi                  # byte at buffer+33 (presumably bits per element -- confirm)
+	addq	$7, %rsi
+	shrq	$3, %rsi                        # (value + 7) / 8
+	imulq	%rax, %rsi                      # times the element count
+	addq	$64, %rsi                       # request 64 extra bytes in front of the data
+	movq	-56(%rbp), %rdi                 # 8-byte Reload
+	callq	halide_malloc@PLT
+	movq	%rax, 16(%rbx)                  # store the raw allocation at buffer+16 (null on failure)
+	testq	%rax, %rax
+	je	.LBB110_26                      # allocation failed: undo earlier allocations
+# %bb.53:                               # %for.inc114
+                                        #   in Loop: Header=BB110_15 Depth=1
+	addq	$64, %rax
+	movq	%rax, 16(%rbx)                  # pointer at buffer+16 now starts 64 bytes into the allocation
+	movq	%rax, %rdi
+	callq	_ZN6Halide7Runtime8Internal21get_pointer_to_headerEPh@PLT
+	movl	%r15d, 8(%rax)                  # record the key hash at header+8
+	movq	$0, (%rax)                      # clear the pointer at header+0
+	incq	%r14
+	incq	%r12
+	incq	%r13                            # low 32 bits of r13 now equal the index just allocated
+	cmpq	-48(%rbp), %r14                 # 8-byte Folded Reload
+	jne	.LBB110_15
+# %bb.54:
+	movl	$1, %ebx                        # all buffers allocated: return 1
+	jmp	.LBB110_55
+.LBB110_26:                             # %for.cond89.preheader
+	movl	$-1, %ebx                       # result -1: allocation failure
+	testq	%r14, %r14
+	je	.LBB110_55                      # failed on the first buffer: nothing to free
+# %bb.27:                               # %for.body92.preheader
+	movq	-64(%rbp), %r14                 # 8-byte Reload
+	movq	-56(%rbp), %r15                 # 8-byte Reload
+	.p2align	4, 0x90
+.LBB110_28:                             # %for.body92
+                                        # =>This Inner Loop Header: Depth=1
+	movl	%r13d, %ebx                     # index of the most recent buffer still allocated
+	movq	(%r14,%rbx,8), %rax
+	movq	16(%rax), %rdi
+	callq	_ZN6Halide7Runtime8Internal21get_pointer_to_headerEPh@PLT
+	movq	%r15, %rdi
+	movq	%rax, %rsi
+	callq	halide_free@PLT                 # free the pointer returned by get_pointer_to_header
+	movq	(%r14,%rbx,8), %rax
+	movq	$0, 16(%rax)                    # clear the pointer at buffer+16
+	decq	%r12
+	decq	%r13
+	cmpq	$1, %r12
+	jg	.LBB110_28                      # walk backwards until every allocated buffer is released
+# %bb.29:
+	movl	$-1, %ebx                       # ebx was reused as an index above; restore the -1 result
+.LBB110_55:                             # %cleanup119
+	movq	_ZN6Halide7Runtime8Internal16memoization_lockE@GOTPCREL(%rip), %rdi
+	vzeroupper                              # the hit path uses ymm0/ymm1; clear the upper halves before the call
+	callq	halide_mutex_unlock@PLT
+	movl	%ebx, %eax                      # return value: 0 hit, 1 miss, -1 allocation failure
+	addq	$56, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.LBB110_41:                             # %do.body33
+	movq	_ZN6Halide7Runtime8Internal19least_recently_usedE@GOTPCREL(%rip), %r14
+	cmpq	%r12, (%r14)                    # with no +16 neighbour the entry must be least_recently_used
+	je	.LBB110_43
+# %bb.42:                               # %if.then35
+	leaq	.L.str.4.44(%rip), %rsi
+	movq	-56(%rbp), %rdi                 # 8-byte Reload
+	callq	halide_print@PLT
+	callq	abort@PLT
+.LBB110_43:                             # %do.end38
+	movq	8(%r12), %rax
+	movq	%rax, (%r14)                    # least_recently_used = entry's +8 neighbour
+.LBB110_44:                             # %do.body41
+	testq	%rax, %rax                      # rax = entry's +8 neighbour; must be non-null
+	jne	.LBB110_46
+# %bb.45:                               # %if.then44
+	leaq	.L.str.5.45(%rip), %rsi
+	movq	-56(%rbp), %rdi                 # 8-byte Reload
+	callq	halide_print@PLT
+	callq	abort@PLT
+	movq	8(%r12), %rax                   # only reached if abort were to return
+.LBB110_46:                             # %do.end47
+	movq	16(%r12), %rcx
+	movq	%rcx, 16(%rax)                  # +8 neighbour's +16 link now skips over the entry
+	movq	$0, 8(%r12)                     # entry becomes the head: clear its +8 link
+	movq	(%rbx), %rax                    # rax = previous most_recently_used
+	movq	%rax, 16(%r12)                  # entry's +16 link points at the previous head
+	testq	%rax, %rax
+	je	.LBB110_48
+# %bb.47:                               # %if.then54
+	movq	%r12, 8(%rax)                   # previous head's +8 link points back at the entry
+.LBB110_48:                             # %if.end56
+	movq	%r12, (%rbx)                    # most_recently_used = entry
+.LBB110_49:                             # %if.end57
+	testl	%r13d, %r13d
+	jle	.LBB110_52                      # no buffers to copy out
+# %bb.50:                               # %for.body62.lr.ph
+	movl	%r13d, %ecx
+	leaq	(,%rcx,8), %rax
+	subq	%rcx, %rax                      # rax = 7 * count, the loop bound in qwords
+	xorl	%ecx, %ecx                      # rcx = qword offset of the current 56-byte record
+	.p2align	4, 0x90
+.LBB110_51:                             # %for.body62
+                                        # =>This Inner Loop Header: Depth=1
+	movq	(%r15), %rdx                    # destination: next caller buffer (r15 walks the caller's array)
+	movq	72(%r12), %rsi                  # source: the entry's record array (pointer at entry+72)
+	vmovups	(%rsi,%rcx,8), %ymm0            # bytes 0..31 of the record
+	vmovups	24(%rsi,%rcx,8), %ymm1          # bytes 24..55; overlaps the first load to cover 56 bytes
+	vmovups	%ymm1, 24(%rdx)
+	vmovups	%ymm0, (%rdx)
+	addq	$7, %rcx                        # next record: 7 qwords = 56 bytes
+	addq	$8, %r15
+	cmpq	%rcx, %rax
+	jne	.LBB110_51
+.LBB110_52:                             # %cleanup119.loopexit211
+	addl	%r13d, 52(%r12)                 # add the count to the 32-bit counter at entry+52 (presumably an in-use count -- confirm)
+	xorl	%ebx, %ebx                      # result 0: hit
+	jmp	.LBB110_55
+.Lfunc_end110:
+	.size	halide_memoization_cache_lookup, .Lfunc_end110-halide_memoization_cache_lookup
+                                        # -- End function
+	.section	.text.halide_memoization_cache_store,"ax",@progbits
+	.weak	halide_memoization_cache_store  # -- Begin function halide_memoization_cache_store
+	.p2align	4, 0x90
+	.type	halide_memoization_cache_store,@function
+halide_memoization_cache_store:         # @halide_memoization_cache_store
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	subq	$72, %rsp
+	movq	%r9, %r12
+	movl	%r8d, -44(%rbp)                 # 4-byte Spill
+	movq	%rcx, -64(%rbp)                 # 8-byte Spill
+	movl	%edx, %r14d
+	movq	%rsi, -72(%rbp)                 # 8-byte Spill
+	movq	%rdi, -80(%rbp)                 # 8-byte Spill
+	movq	(%r9), %rax
+	movq	16(%rax), %rdi
+	callq	_ZN6Halide7Runtime8Internal21get_pointer_to_headerEPh@PLT
+	movl	8(%rax), %r15d
+	movzbl	%r15b, %ebx
+	movq	_ZN6Halide7Runtime8Internal16memoization_lockE@GOTPCREL(%rip), %rdi
+	callq	halide_mutex_lock@PLT
+	movq	_ZN6Halide7Runtime8Internal13cache_entriesE@GOTPCREL(%rip), %rax
+	movq	%rbx, -104(%rbp)                # 8-byte Spill
+	movq	(%rax,%rbx,8), %r13
+	movslq	%r14d, %rax
+	movq	%rax, -56(%rbp)                 # 8-byte Spill
+	movq	%r15, %r14
+	testq	%r13, %r13
+	je	.LBB111_13
+# %bb.1:                                # %while.body.lr.ph
+	movl	-44(%rbp), %eax                 # 4-byte Reload
+	testl	%eax, %eax
+	jle	.LBB111_15
+# %bb.2:                                # %while.body.us.preheader
+	cltq
+	movq	%rax, -112(%rbp)                # 8-byte Spill
+	jmp	.LBB111_3
+	.p2align	4, 0x90
+.LBB111_20:                             # %if.end59
+                                        #   in Loop: Header=BB111_15 Depth=1
+	movq	(%r13), %r13
+	testq	%r13, %r13
+	je	.LBB111_13
+.LBB111_15:                             # %while.body
+                                        # =>This Inner Loop Header: Depth=1
+	cmpl	%r14d, 48(%r13)
+	jne	.LBB111_20
+# %bb.16:                               # %land.lhs.true
+                                        #   in Loop: Header=BB111_15 Depth=1
+	movq	-56(%rbp), %rax                 # 8-byte Reload
+	cmpq	%rax, 32(%r13)
+	jne	.LBB111_20
+# %bb.17:                               # %land.lhs.true12
+                                        #   in Loop: Header=BB111_15 Depth=1
+	movq	40(%r13), %rdi
+	movq	-72(%rbp), %rsi                 # 8-byte Reload
+	movq	-56(%rbp), %rdx                 # 8-byte Reload
+	callq	_ZN6Halide7Runtime8Internal10keys_equalEPKhS3_m@PLT
+	testb	%al, %al
+	je	.LBB111_20
+# %bb.18:                               # %land.lhs.true15
+                                        #   in Loop: Header=BB111_15 Depth=1
+	movq	64(%r13), %rsi
+	movq	-64(%rbp), %rdi                 # 8-byte Reload
+	callq	_ZN6Halide7Runtime8Internal16buffer_has_shapeEPK15halide_buffer_tPK18halide_dimension_t@PLT
+	testb	%al, %al
+	je	.LBB111_20
+# %bb.19:                               # %land.lhs.true18
+                                        #   in Loop: Header=BB111_15 Depth=1
+	movl	-44(%rbp), %eax                 # 4-byte Reload
+	cmpl	%eax, 56(%r13)
+	jne	.LBB111_20
+	jmp	.LBB111_52
+.LBB111_11:                             # %for.cond.cleanup.us
+                                        #   in Loop: Header=BB111_3 Depth=1
+	testb	%al, %al
+	movq	-88(%rbp), %r12                 # 8-byte Reload
+	movq	-96(%rbp), %r14                 # 8-byte Reload
+	jne	.LBB111_21
+	.p2align	4, 0x90
+.LBB111_12:                             # %if.end59.us
+                                        #   in Loop: Header=BB111_3 Depth=1
+	movq	(%r13), %r13
+	testq	%r13, %r13
+	je	.LBB111_13
+.LBB111_3:                              # %while.body.us
+                                        # =>This Loop Header: Depth=1
+                                        #     Child Loop BB111_9 Depth 2
+	cmpl	%r14d, 48(%r13)
+	jne	.LBB111_12
+# %bb.4:                                # %land.lhs.true.us
+                                        #   in Loop: Header=BB111_3 Depth=1
+	movq	-56(%rbp), %rax                 # 8-byte Reload
+	cmpq	%rax, 32(%r13)
+	jne	.LBB111_12
+# %bb.5:                                # %land.lhs.true12.us
+                                        #   in Loop: Header=BB111_3 Depth=1
+	movq	40(%r13), %rdi
+	movq	-72(%rbp), %rsi                 # 8-byte Reload
+	movq	-56(%rbp), %rdx                 # 8-byte Reload
+	callq	_ZN6Halide7Runtime8Internal10keys_equalEPKhS3_m@PLT
+	testb	%al, %al
+	je	.LBB111_12
+# %bb.6:                                # %land.lhs.true15.us
+                                        #   in Loop: Header=BB111_3 Depth=1
+	movq	64(%r13), %rsi
+	movq	-64(%rbp), %rdi                 # 8-byte Reload
+	callq	_ZN6Halide7Runtime8Internal16buffer_has_shapeEPK15halide_buffer_tPK18halide_dimension_t@PLT
+	testb	%al, %al
+	je	.LBB111_12
+# %bb.7:                                # %land.lhs.true18.us
+                                        #   in Loop: Header=BB111_3 Depth=1
+	movl	-44(%rbp), %eax                 # 4-byte Reload
+	cmpl	%eax, 56(%r13)
+	jne	.LBB111_12
+# %bb.8:                                # %for.cond.preheader.us
+                                        #   in Loop: Header=BB111_3 Depth=1
+	movq	%r14, -96(%rbp)                 # 8-byte Spill
+	movq	%r12, -88(%rbp)                 # 8-byte Spill
+	movq	72(%r13), %rcx
+	movb	$1, %r15b
+	movl	$1, %ebx
+	xorl	%r12d, %r12d
+	.p2align	4, 0x90
+.LBB111_9:                              # %for.body.us
+                                        #   Parent Loop BB111_3 Depth=1
+                                        # =>  This Inner Loop Header: Depth=2
+	movq	-88(%rbp), %rax                 # 8-byte Reload
+	movq	-8(%rax,%rbx,8), %r14
+	movq	40(%rcx,%r12), %rsi
+	movq	%r14, %rdi
+	callq	_ZN6Halide7Runtime8Internal16buffer_has_shapeEPK15halide_buffer_tPK18halide_dimension_t@PLT
+	movq	72(%r13), %rcx
+	movq	16(%rcx,%r12), %rdx
+	cmpq	16(%r14), %rdx
+	movzbl	%r15b, %r15d
+	movl	$0, %edx
+	cmovel	%edx, %r15d
+	testb	%al, %al
+	je	.LBB111_11
+# %bb.10:                               # %for.body.us
+                                        #   in Loop: Header=BB111_9 Depth=2
+	leaq	1(%rbx), %rdx
+	addq	$56, %r12
+	cmpq	-112(%rbp), %rbx                # 8-byte Folded Reload
+	movq	%rdx, %rbx
+	jl	.LBB111_9
+	jmp	.LBB111_11
+.LBB111_13:                             # %for.cond61.preheader
+	movl	-44(%rbp), %eax                 # 4-byte Reload
+	testl	%eax, %eax
+	jle	.LBB111_14
+# %bb.25:                               # %for.body64.preheader
+	movl	%eax, %eax
+	xorl	%ecx, %ecx
+	xorl	%r13d, %r13d
+	jmp	.LBB111_26
+	.p2align	4, 0x90
+.LBB111_36:                             # %_ZNK15halide_buffer_t12begin_offsetEv.exit.loopexit.i
+                                        #   in Loop: Header=BB111_26 Depth=1
+	subq	%r9, %rsi
+	incq	%rsi
+.LBB111_37:                             # %_ZNK15halide_buffer_t13size_in_bytesEv.exit
+                                        #   in Loop: Header=BB111_26 Depth=1
+	movzbl	33(%rdx), %edx
+	addq	$7, %rdx
+	shrq	$3, %rdx
+	imulq	%rsi, %rdx
+	addq	%rdx, %r13
+	incq	%rcx
+	cmpq	%rax, %rcx
+	je	.LBB111_38
+.LBB111_26:                             # %for.body64
+                                        # =>This Loop Header: Depth=1
+                                        #     Child Loop BB111_29 Depth 2
+                                        #     Child Loop BB111_33 Depth 2
+	movq	(%r12,%rcx,8), %rdx
+	movl	36(%rdx), %edi
+	testl	%edi, %edi
+	jle	.LBB111_27
+# %bb.28:                               # %for.body.lr.ph.i.i
+                                        #   in Loop: Header=BB111_26 Depth=1
+	movq	40(%rdx), %r8
+	shlq	$4, %rdi
+	xorl	%r9d, %r9d
+	xorl	%esi, %esi
+	jmp	.LBB111_29
+	.p2align	4, 0x90
+.LBB111_31:                             # %if.end.i.i
+                                        #   in Loop: Header=BB111_29 Depth=2
+	addq	$16, %r9
+	cmpq	%r9, %rdi
+	je	.LBB111_32
+.LBB111_29:                             # %for.body.i.i
+                                        #   Parent Loop BB111_26 Depth=1
+                                        # =>  This Inner Loop Header: Depth=2
+	movl	8(%r8,%r9), %r10d
+	testl	%r10d, %r10d
+	jle	.LBB111_31
+# %bb.30:                               # %if.then.i.i
+                                        #   in Loop: Header=BB111_29 Depth=2
+	movslq	4(%r8,%r9), %r11
+	decq	%r11
+	imulq	%r10, %r11
+	addq	%r11, %rsi
+	jmp	.LBB111_31
+	.p2align	4, 0x90
+.LBB111_32:                             # %for.body.i12.i.preheader
+                                        #   in Loop: Header=BB111_26 Depth=1
+	xorl	%r10d, %r10d
+	xorl	%r9d, %r9d
+	jmp	.LBB111_33
+	.p2align	4, 0x90
+.LBB111_35:                             # %if.end.i23.i
+                                        #   in Loop: Header=BB111_33 Depth=2
+	addq	$16, %r10
+	cmpq	%r10, %rdi
+	je	.LBB111_36
+.LBB111_33:                             # %for.body.i12.i
+                                        #   Parent Loop BB111_26 Depth=1
+                                        # =>  This Inner Loop Header: Depth=2
+	movslq	8(%r8,%r10), %r11
+	testq	%r11, %r11
+	jns	.LBB111_35
+# %bb.34:                               # %if.then.i19.i
+                                        #   in Loop: Header=BB111_33 Depth=2
+	movslq	4(%r8,%r10), %rbx
+	decq	%rbx
+	imulq	%r11, %rbx
+	addq	%rbx, %r9
+	jmp	.LBB111_35
+	.p2align	4, 0x90
+.LBB111_27:                             #   in Loop: Header=BB111_26 Depth=1
+	movl	$1, %esi
+	jmp	.LBB111_37
+.LBB111_14:
+	xorl	%r13d, %r13d
+.LBB111_38:                             # %for.cond.cleanup63
+	movq	%r14, %rbx
+	movq	_ZN6Halide7Runtime8Internal18current_cache_sizeE@GOTPCREL(%rip), %r15
+	addq	%r13, (%r15)
+	callq	_ZN6Halide7Runtime8Internal11prune_cacheEv@PLT
+	movl	$96, %esi
+	xorl	%edi, %edi
+	callq	halide_malloc@PLT
+	movq	%rax, %r14
+	testq	%rax, %rax
+	je	.LBB111_40
+# %bb.39:                               # %if.then76
+	movzbl	16(%rbp), %eax
+	subq	$8, %rsp
+	movzbl	%al, %eax
+	movq	%r14, %rdi
+	movq	-72(%rbp), %rsi                 # 8-byte Reload
+	movq	-56(%rbp), %rdx                 # 8-byte Reload
+	movl	%ebx, %ecx
+	movq	-64(%rbp), %r8                  # 8-byte Reload
+	movl	-44(%rbp), %r9d                 # 4-byte Reload
+	pushq	24(%rbp)
+	pushq	%rax
+	pushq	%r12
+	callq	_ZN6Halide7Runtime8Internal10CacheEntry4initEPKhmjPK15halide_buffer_tiPPS5_by@PLT
+	addq	$32, %rsp
+	testb	%al, %al
+	je	.LBB111_40
+# %bb.45:                               # %if.end101
+	movq	-104(%rbp), %rdx                # 8-byte Reload
+	movq	_ZN6Halide7Runtime8Internal13cache_entriesE@GOTPCREL(%rip), %rsi
+	movq	(%rsi,%rdx,8), %rax
+	movq	%rax, (%r14)
+	movq	_ZN6Halide7Runtime8Internal18most_recently_usedE@GOTPCREL(%rip), %rax
+	movq	(%rax), %rcx
+	movq	%rcx, 16(%r14)
+	testq	%rcx, %rcx
+	je	.LBB111_47
+# %bb.46:                               # %if.then106
+	movq	%r14, 8(%rcx)
+.LBB111_47:                             # %if.end107
+	movq	%r14, (%rax)
+	movq	_ZN6Halide7Runtime8Internal19least_recently_usedE@GOTPCREL(%rip), %rax
+	cmpq	$0, (%rax)
+	movl	-44(%rbp), %ecx                 # 4-byte Reload
+	jne	.LBB111_49
+# %bb.48:                               # %if.then109
+	movq	%r14, (%rax)
+.LBB111_49:                             # %if.end110
+	movq	%r14, (%rsi,%rdx,8)
+	movl	%ecx, 52(%r14)
+	testl	%ecx, %ecx
+	jle	.LBB111_52
+# %bb.50:                               # %for.body117.preheader
+	movl	-44(%rbp), %ebx                 # 4-byte Reload
+	xorl	%r15d, %r15d
+	.p2align	4, 0x90
+.LBB111_51:                             # %for.body117
+                                        # =>This Inner Loop Header: Depth=1
+	movq	(%r12,%r15,8), %rax
+	movq	16(%rax), %rdi
+	callq	_ZN6Halide7Runtime8Internal21get_pointer_to_headerEPh@PLT
+	movq	%r14, (%rax)
+	incq	%r15
+	cmpq	%r15, %rbx
+	jne	.LBB111_51
+	jmp	.LBB111_52
+.LBB111_40:                             # %if.then83
+	subq	%r13, (%r15)
+	movl	-44(%rbp), %eax                 # 4-byte Reload
+	testl	%eax, %eax
+	jle	.LBB111_43
+# %bb.41:                               # %for.body88.preheader
+	movl	%eax, %ebx
+	xorl	%r15d, %r15d
+	.p2align	4, 0x90
+.LBB111_42:                             # %for.body88
+                                        # =>This Inner Loop Header: Depth=1
+	movq	(%r12,%r15,8), %rax
+	movq	16(%rax), %rdi
+	callq	_ZN6Halide7Runtime8Internal21get_pointer_to_headerEPh@PLT
+	movq	$0, (%rax)
+	incq	%r15
+	cmpq	%r15, %rbx
+	jne	.LBB111_42
+.LBB111_43:                             # %for.cond.cleanup87
+	testq	%r14, %r14
+	je	.LBB111_52
+# %bb.44:                               # %if.then99
+	movq	-80(%rbp), %rdi                 # 8-byte Reload
+	movq	%r14, %rsi
+	callq	halide_free@PLT
+.LBB111_52:                             # %cleanup132
+	movq	_ZN6Halide7Runtime8Internal16memoization_lockE@GOTPCREL(%rip), %rdi
+	callq	halide_mutex_unlock@PLT
+	xorl	%eax, %eax
+	addq	$72, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.LBB111_21:                             # %do.body
+	testb	$1, %r15b
+	jne	.LBB111_23
+# %bb.22:                               # %if.then42
+	leaq	.L.str.9.46(%rip), %rsi
+	movq	-80(%rbp), %rdi                 # 8-byte Reload
+	callq	halide_print@PLT
+	callq	abort@PLT
+.LBB111_23:                             # %for.body48.preheader
+	movl	-44(%rbp), %ebx                 # 4-byte Reload
+	xorl	%r14d, %r14d
+	.p2align	4, 0x90
+.LBB111_24:                             # %for.body48
+                                        # =>This Inner Loop Header: Depth=1
+	movq	(%r12,%r14,8), %rax
+	movq	16(%rax), %rdi
+	callq	_ZN6Halide7Runtime8Internal21get_pointer_to_headerEPh@PLT
+	movq	$0, (%rax)
+	incq	%r14
+	cmpq	%r14, %rbx
+	jne	.LBB111_24
+	jmp	.LBB111_52
+.Lfunc_end111:
+	.size	halide_memoization_cache_store, .Lfunc_end111-halide_memoization_cache_store
+                                        # -- End function
+	.section	.text.halide_memoization_cache_release,"ax",@progbits
+	.weak	halide_memoization_cache_release # -- Begin function halide_memoization_cache_release
+	.p2align	4, 0x90
+	.type	halide_memoization_cache_release,@function
+halide_memoization_cache_release:       # @halide_memoization_cache_release -- decrements the 32-bit counter at offset 52 of the cache entry recorded in a header, or frees the header when no entry is recorded; rdi = first argument (passed on to halide_print/halide_free), rsi = pointer handed to get_pointer_to_header
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r14
+	pushq	%rbx
+	movq	%rdi, %rbx                      # keep the first argument across the calls below
+	movq	%rsi, %rdi                      # second argument becomes the argument of get_pointer_to_header
+	callq	_ZN6Halide7Runtime8Internal21get_pointer_to_headerEPh@PLT
+	movq	(%rax), %r14                    # r14 = entry pointer stored in the first word of the header
+	testq	%r14, %r14
+	je	.LBB112_4                       # no entry recorded: free the header instead
+# %bb.1:                                # %if.else
+	movq	_ZN6Halide7Runtime8Internal16memoization_lockE@GOTPCREL(%rip), %rdi
+	callq	halide_mutex_lock@PLT           # counter update happens under memoization_lock
+	movl	52(%r14), %eax                  # eax = counter at offset 52 (presumably the in-use count; confirm against CacheEntry layout)
+	testl	%eax, %eax
+	jne	.LBB112_3                       # nonzero counter: go decrement it
+# %bb.2:                                # %if.then4
+	leaq	.L.str.12.47(%rip), %rsi        # counter already zero: print this message and abort
+	movq	%rbx, %rdi
+	callq	halide_print@PLT
+	callq	abort@PLT
+	movl	52(%r14), %eax                  # reload the counter after the calls
+.LBB112_3:                              # %do.end
+	decl	%eax
+	movl	%eax, 52(%r14)                  # store the decremented counter back
+	movq	_ZN6Halide7Runtime8Internal16memoization_lockE@GOTPCREL(%rip), %rdi
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	jmp	halide_mutex_unlock@PLT         # TAILCALL -- release memoization_lock and return
+.LBB112_4:                              # %if.then
+	movq	%rbx, %rdi                      # first argument of this function
+	movq	%rax, %rsi                      # header pointer returned by get_pointer_to_header
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	jmp	halide_free@PLT                 # TAILCALL -- halide_free(first argument, header)
+.Lfunc_end112:
+	.size	halide_memoization_cache_release, .Lfunc_end112-halide_memoization_cache_release
+                                        # -- End function
+	.section	.text.halide_memoization_cache_evict,"ax",@progbits
+	.weak	halide_memoization_cache_evict  # -- Begin function halide_memoization_cache_evict
+	.p2align	4, 0x90
+	.type	halide_memoization_cache_evict,@function
+halide_memoization_cache_evict:         # @halide_memoization_cache_evict -- under memoization_lock, walks every chain of cache_entries and unlinks, destroys and frees each entry whose flag byte at offset 88 is set and whose 64-bit value at offset 80 equals rsi; rdi is passed on to halide_free
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	subq	$24, %rsp
+	movq	%rsi, %rbx                      # rbx = value to match against offset 80 of each entry
+	movq	%rdi, -56(%rbp)                 # 8-byte Spill -- first argument, reloaded for halide_free
+	movq	_ZN6Halide7Runtime8Internal16memoization_lockE@GOTPCREL(%rip), %rdi
+	callq	halide_mutex_lock@PLT
+	movq	_ZN6Halide7Runtime8Internal13cache_entriesE@GOTPCREL(%rip), %r14 # r14 = address of the current table slot
+	movl	$2048, %eax                     # imm = 0x800 -- table span in bytes (256 slots of 8 bytes)
+	addq	_ZN6Halide7Runtime8Internal13cache_entriesE@GOTPCREL(%rip), %rax
+	movq	%rax, -48(%rbp)                 # 8-byte Spill -- one-past-the-end slot address
+	jmp	.LBB113_2
+	.p2align	4, 0x90
+.LBB113_1:                              # %if.end25
+                                        #   in Loop: Header=BB113_2 Depth=1
+	addq	$8, %r14                        # advance to the next table slot
+	cmpq	-48(%rbp), %r14                 # 8-byte Folded Reload
+	je	.LBB113_15                      # all slots visited
+.LBB113_2:                              # %for.body
+                                        # =>This Loop Header: Depth=1
+                                        #     Child Loop BB113_6 Depth 2
+	movq	(%r14), %r13                    # r13 = first entry of this chain
+	testq	%r13, %r13
+	je	.LBB113_1                       # empty slot
+# %bb.3:                                # %while.body.preheader
+                                        #   in Loop: Header=BB113_2 Depth=1
+	movq	%r14, %r12                      # r12 = address of the link that points at the current entry
+	jmp	.LBB113_6
+	.p2align	4, 0x90
+.LBB113_4:                              #   in Loop: Header=BB113_6 Depth=2
+	movq	%r15, %r12                      # entry kept: its chain link (offset 0) becomes the link to patch
+	testq	%r13, %r13
+	je	.LBB113_1
+.LBB113_6:                              # %while.body
+                                        #   Parent Loop BB113_2 Depth=1
+                                        # =>  This Inner Loop Header: Depth=2
+	movq	%r13, %r15                      # r15 = current entry
+	movq	(%r13), %r13                    # r13 = next entry in the chain, read before any free
+	cmpb	$0, 88(%r15)                    # flag byte at offset 88 (presumably has_eviction_key; confirm)
+	je	.LBB113_4
+# %bb.7:                                # %land.lhs.true
+                                        #   in Loop: Header=BB113_6 Depth=2
+	cmpq	%rbx, 80(%r15)                  # value at offset 80 (presumably eviction_key; confirm) against rsi
+	jne	.LBB113_4
+# %bb.8:                                # %if.then7
+                                        #   in Loop: Header=BB113_6 Depth=2
+	movq	%r13, (%r12)                    # unlink the entry from its chain
+	movq	8(%r15), %rax                   # rax = neighbour stored at offset 8
+	movq	16(%r15), %rcx                  # rcx = neighbour stored at offset 16
+	testq	%rax, %rax
+	je	.LBB113_12
+# %bb.9:                                # %if.then9
+                                        #   in Loop: Header=BB113_6 Depth=2
+	movq	%rcx, 16(%rax)                  # offset-8 neighbour now points past the removed entry
+	movq	16(%r15), %rcx
+	testq	%rcx, %rcx
+	jne	.LBB113_13
+.LBB113_10:                             # %if.end
+                                        #   in Loop: Header=BB113_6 Depth=2
+	movq	_ZN6Halide7Runtime8Internal19least_recently_usedE@GOTPCREL(%rip), %rcx # no offset-16 neighbour: patch least_recently_used instead
+	jmp	.LBB113_14
+.LBB113_12:                             # %if.else
+                                        #   in Loop: Header=BB113_6 Depth=2
+	movq	_ZN6Halide7Runtime8Internal18most_recently_usedE@GOTPCREL(%rip), %rdx
+	movq	%rcx, (%rdx)                    # no offset-8 neighbour: most_recently_used takes the offset-16 neighbour
+	testq	%rcx, %rcx
+	je	.LBB113_10
+.LBB113_13:                             #   in Loop: Header=BB113_6 Depth=2
+	addq	$8, %rcx                        # address of the offset-8 field of the offset-16 neighbour
+.LBB113_14:                             # %if.end
+                                        #   in Loop: Header=BB113_6 Depth=2
+	movq	%rax, (%rcx)                    # finish unlinking from the recency list
+	movq	%r15, %rdi
+	callq	_ZN6Halide7Runtime8Internal10CacheEntry7destroyEv@PLT
+	movq	-56(%rbp), %rdi                 # 8-byte Reload
+	movq	%r15, %rsi
+	callq	halide_free@PLT                 # halide_free(first argument, removed entry)
+	testq	%r13, %r13
+	jne	.LBB113_6                       # continue with the rest of the chain
+	jmp	.LBB113_1
+.LBB113_15:                             # %for.cond.cleanup
+	movq	_ZN6Halide7Runtime8Internal16memoization_lockE@GOTPCREL(%rip), %rdi
+	addq	$24, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	jmp	halide_mutex_unlock@PLT         # TAILCALL -- release memoization_lock and return
+.Lfunc_end113:
+	.size	halide_memoization_cache_evict, .Lfunc_end113-halide_memoization_cache_evict
+                                        # -- End function
+	.section	.text.halide_string_to_string,"ax",@progbits
+	.weak	halide_string_to_string         # -- Begin function halide_string_to_string
+	.p2align	4, 0x90
+	.type	halide_string_to_string,@function
+halide_string_to_string:                # @halide_string_to_string -- copies the NUL-terminated string in rdx (or .L.str.50 when rdx is null) to rdi without writing at or past rsi; returns the address of the terminator written, or rsi when the output was truncated
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	cmpq	%rsi, %rdi
+	jae	.LBB114_6                       # destination already at or past the end: return it unchanged
+# %bb.1:                                # %if.end
+	movq	%rsi, %rax                      # rax = end of the destination buffer
+	testq	%rdx, %rdx
+	leaq	.L.str.50(%rip), %rcx           # fallback source used when rdx is null
+	cmovneq	%rdx, %rcx                      # rcx = source pointer
+	.p2align	4, 0x90
+.LBB114_2:                              # %if.end5
+                                        # =>This Inner Loop Header: Depth=1
+	movzbl	(%rcx), %edx
+	movb	%dl, (%rdi)                     # copy one byte, terminator included
+	testb	%dl, %dl
+	je	.LBB114_6                       # terminator copied: return its address
+# %bb.3:                                # %if.end8
+                                        #   in Loop: Header=BB114_2 Depth=1
+	incq	%rdi
+	incq	%rcx
+	cmpq	%rax, %rdi
+	jne	.LBB114_2
+# %bb.4:                                # %if.then4
+	movb	$0, -1(%rdi)                    # buffer full: overwrite the last byte with a terminator
+	popq	%rbp
+	retq                                    # returns rax = end pointer
+.LBB114_6:
+	movq	%rdi, %rax                      # return the current write position
+	popq	%rbp
+	retq
+.Lfunc_end114:
+	.size	halide_string_to_string, .Lfunc_end114-halide_string_to_string
+                                        # -- End function
+	.section	.text.halide_uint64_to_string,"ax",@progbits
+	.weak	halide_uint64_to_string         # -- Begin function halide_uint64_to_string
+	.p2align	4, 0x90
+	.type	halide_uint64_to_string,@function
+halide_uint64_to_string:                # @halide_uint64_to_string -- formats the unsigned value in rdx as decimal with at least ecx digits into a stack buffer, then appends it to rdi (end rsi) via halide_string_to_string and returns that result
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	subq	$32, %rsp                       # 32-byte scratch buffer, filled from its high end downwards
+	movb	$0, -1(%rbp)                    # terminator in the last byte
+	leaq	-2(%rbp), %rax                  # rax = slot for the least significant digit
+	testq	%rdx, %rdx
+	jne	.LBB115_2
+# %bb.1:                                # %entry
+	testl	%ecx, %ecx
+	jle	.LBB115_5                       # value is zero and no minimum digits requested: emit nothing
+.LBB115_2:                              # %for.body.preheader
+	movl	$1, %r10d                       # r10d = number of the digit about to be produced, starting at 1
+	movabsq	$-3689348814741910323, %r9      # imm = 0xCCCCCCCCCCCCCCCD -- multiplier for unsigned division by 10
+	.p2align	4, 0x90
+.LBB115_3:                              # %for.body
+                                        # =>This Inner Loop Header: Depth=1
+	movq	%rdx, %r8                       # r8 = value before the division
+	movl	%r10d, %r11d
+	mulxq	%r9, %rdx, %rdx                 # rdx = high 64 bits of value * multiplier
+	shrq	$3, %rdx                        # rdx = value / 10
+	imull	$246, %edx, %r10d               # 246 is -10 modulo 256, so the low byte is -10 * quotient
+	addl	%r8d, %r10d                     # low byte = value - 10 * quotient, the current digit
+	addb	$48, %r10b                      # to ASCII
+	movb	%r10b, (%rax)
+	decq	%rax                            # next digit goes one byte lower
+	leal	1(%r11), %r10d
+	cmpl	%ecx, %r11d
+	jl	.LBB115_3                       # keep going until the minimum digit count is met
+# %bb.4:                                # %for.body
+                                        #   in Loop: Header=BB115_3 Depth=1
+	cmpq	$9, %r8
+	ja	.LBB115_3                       # keep going while the value just divided had more than one digit
+.LBB115_5:                              # %for.cond.cleanup
+	incq	%rax                            # step back onto the most significant digit
+	movq	%rax, %rdx                      # source string; rdi and rsi still hold the caller's destination and end
+	callq	halide_string_to_string@PLT
+	addq	$32, %rsp
+	popq	%rbp
+	retq
+.Lfunc_end115:
+	.size	halide_uint64_to_string, .Lfunc_end115-halide_uint64_to_string
+                                        # -- End function
+	.section	.text.halide_int64_to_string,"ax",@progbits
+	.weak	halide_int64_to_string          # -- Begin function halide_int64_to_string
+	.p2align	4, 0x90
+	.type	halide_int64_to_string,@function
+halide_int64_to_string:                 # @halide_int64_to_string -- signed wrapper: for a negative rdx with room in the buffer it writes '-' and negates the value, then tail-calls halide_uint64_to_string with rdi/rsi/ecx passed through
+# %bb.0:                                # %entry
+	cmpq	%rsi, %rdi
+	jae	.LBB116_3                       # no room left: skip the sign handling
+# %bb.1:                                # %entry
+	testq	%rdx, %rdx
+	jns	.LBB116_3                       # non-negative: format as is
+# %bb.2:                                # %if.then
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movb	$45, (%rdi)                     # 45 is ASCII '-'
+	incq	%rdi
+	negq	%rdx                            # format the magnitude
+	popq	%rbp
+.LBB116_3:                              # %if.end
+	jmp	halide_uint64_to_string@PLT     # TAILCALL
+.Lfunc_end116:
+	.size	halide_int64_to_string, .Lfunc_end116-halide_int64_to_string
+                                        # -- End function
+	.section	.rodata.cst16,"aM",@progbits,16
+	.p2align	4, 0x0                          # -- Begin function halide_double_to_string
+.LCPI117_0:
+	.quad	0x8000000000000000              # double -0
+	.quad	0x8000000000000000              # double -0
+.LCPI117_6:
+	.long	1127219200                      # 0x43300000
+	.long	1160773632                      # 0x45300000
+	.long	0                               # 0x0
+	.long	0                               # 0x0
+.LCPI117_7:
+	.quad	0x4330000000000000              # double 4503599627370496
+	.quad	0x4530000000000000              # double 1.9342813113834067E+25
+	.section	.rodata.cst8,"aM",@progbits,8
+	.p2align	3, 0x0
+.LCPI117_1:
+	.quad	0x3ff0000000000000              # double 1
+.LCPI117_2:
+	.quad	0x4024000000000000              # double 10
+.LCPI117_3:
+	.quad	0x412e848000000000              # double 1.0E+6
+.LCPI117_4:
+	.quad	0x3fe0000000000000              # double 0.5
+.LCPI117_5:
+	.quad	0x43e0000000000000              # double 9.2233720368547758E+18
+	.section	.text.halide_double_to_string,"ax",@progbits
+	.weak	halide_double_to_string
+	.p2align	4, 0x90
+	.type	halide_double_to_string,@function
+halide_double_to_string:                # @halide_double_to_string -- appends a decimal rendering of the double in xmm0 to rdi (end rsi); a nonzero edx selects the mantissa/exponent form, zero selects the integer.fraction form; returns the pointer produced by the last string helper called
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	subq	$536, %rsp                      # imm = 0x218
+	movl	%edx, %r12d                     # r12d = format selector
+	movq	%rsi, %rbx                      # rbx = end of the destination buffer
+	movq	%rdi, %r14                      # r14 = current destination
+	vmovsd	%xmm0, -48(%rbp)                # value kept in memory at -48(%rbp)
+	movq	$0, -56(%rbp)
+	leaq	-56(%rbp), %rdi
+	leaq	-48(%rbp), %rsi
+	movl	$8, %edx
+	callq	memcpy@PLT                      # copy the raw 64 bits of the double into -56(%rbp)
+	movq	-56(%rbp), %rax                 # rax = raw bits; the sign is bit 63
+	movb	$52, %cl
+	bzhiq	%rcx, %rax, %r15                # r15 = low 52 bits (mantissa field)
+	movq	%rax, %r13
+	shrq	$52, %r13
+	andl	$2047, %r13d                    # imm = 0x7FF -- r13d = 11-bit exponent field
+	cmpl	$2047, %r13d                    # imm = 0x7FF
+	jne	.LBB117_9
+# %bb.1:                                # %if.then
+	testq	%r15, %r15                      # exponent all ones: mantissa decides between the two special cases
+	je	.LBB117_6
+# %bb.2:                                # %if.then4
+	testq	%rax, %rax
+	js	.LBB117_3
+# %bb.5:                                # %if.else
+	leaq	.L.str.2.58(%rip), %rdx         # exponent all ones, mantissa nonzero, sign clear
+	jmp	.LBB117_4
+.LBB117_9:                              # %if.else15
+	testl	%r13d, %r13d
+	jne	.LBB117_18
+# %bb.10:                               # %if.else15
+	testq	%r15, %r15
+	jne	.LBB117_18
+# %bb.11:                               # %if.then18
+	testl	%r12d, %r12d                    # value is zero: pick the string by format selector and sign
+	je	.LBB117_15
+# %bb.12:                               # %if.then20
+	testq	%rax, %rax
+	js	.LBB117_13
+# %bb.14:                               # %if.else24
+	leaq	.L.str.6.62(%rip), %rdx         # zero, selector nonzero, sign clear
+	jmp	.LBB117_4
+.LBB117_18:                             # %if.end32
+	testq	%rax, %rax
+	js	.LBB117_19                      # negative: emit a prefix string and flip the sign first
+# %bb.20:                               # %if.end36
+	testl	%r12d, %r12d
+	je	.LBB117_35                      # selector zero: integer.fraction form
+.LBB117_21:                             # %while.condthread-pre-split
+	vmovsd	-48(%rbp), %xmm0                # xmm0 = mem[0],zero
+	xorl	%r12d, %r12d                    # r12d = decimal exponent, starts at 0
+	vmovsd	.LCPI117_1(%rip), %xmm1         # xmm1 = mem[0],zero
+	vucomisd	%xmm0, %xmm1
+	jbe	.LBB117_25
+# %bb.22:                               # %while.body.preheader
+	xorl	%r12d, %r12d
+	vmovsd	.LCPI117_2(%rip), %xmm2         # xmm2 = mem[0],zero
+	.p2align	4, 0x90
+.LBB117_23:                             # %while.body
+                                        # =>This Inner Loop Header: Depth=1
+	vmulsd	%xmm2, %xmm0, %xmm0             # value below 1: multiply by 10 and lower the exponent
+	decl	%r12d
+	vucomisd	%xmm0, %xmm1
+	ja	.LBB117_23
+# %bb.24:                               # %while.cond.while.cond40thread-pre-split_crit_edge
+	vmovsd	%xmm0, -48(%rbp)
+.LBB117_25:                             # %while.cond40thread-pre-split
+	vucomisd	.LCPI117_2(%rip), %xmm0
+	jb	.LBB117_29
+# %bb.26:                               # %while.body42.preheader
+	vmovsd	.LCPI117_2(%rip), %xmm1         # xmm1 = mem[0],zero
+	.p2align	4, 0x90
+.LBB117_27:                             # %while.body42
+                                        # =>This Inner Loop Header: Depth=1
+	vdivsd	%xmm1, %xmm0, %xmm0             # value at least 10: divide by 10 and raise the exponent
+	incl	%r12d
+	vucomisd	%xmm1, %xmm0
+	jae	.LBB117_27
+# %bb.28:                               # %while.cond40.while.end43_crit_edge
+	vmovsd	%xmm0, -48(%rbp)
+.LBB117_29:                             # %while.end43
+	vmovsd	.LCPI117_3(%rip), %xmm1         # xmm1 = mem[0],zero
+	vfmadd213sd	.LCPI117_4(%rip), %xmm0, %xmm1 # xmm1 = (xmm0 * xmm1) + mem -- scaled value * 1.0E+6 + 0.5
+	vcvttsd2si	%xmm1, %rax
+	movq	%rax, %rcx
+	vsubsd	.LCPI117_5(%rip), %xmm1, %xmm0
+	sarq	$63, %rcx
+	vcvttsd2si	%xmm0, %rdx
+	andq	%rcx, %rdx
+	orq	%rax, %rdx                      # rdx = xmm1 converted to an unsigned 64-bit integer
+	movabsq	$4835703278458516699, %rax      # imm = 0x431BDE82D7B634DB -- multiplier for division by 1000000
+	mulxq	%rax, %rax, %rax
+	shrq	$18, %rax                       # rax = rdx / 1000000 (leading part)
+	imulq	$-1000000, %rax, %r15           # imm = 0xFFF0BDC0
+	addq	%rdx, %r15                      # r15 = rdx modulo 1000000 (six fractional digits)
+	movq	%r14, %rdi
+	movq	%rbx, %rsi
+	movq	%rax, %rdx
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT      # leading part, at least 1 digit
+	leaq	.L.str.30.142(%rip), %rdx       # separator string (presumably the decimal point; confirm)
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	movq	%r15, %rdx
+	movl	$6, %ecx
+	callq	halide_int64_to_string@PLT      # fractional part, padded to 6 digits
+	testl	%r12d, %r12d
+	js	.LBB117_31
+# %bb.30:                               # %if.then53
+	leaq	.L.str.11.67(%rip), %rdx        # exponent marker used when the decimal exponent is not negative
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	callq	halide_string_to_string@PLT
+	jmp	.LBB117_32
+.LBB117_6:                              # %if.else9
+	testq	%rax, %rax
+	js	.LBB117_7
+# %bb.8:                                # %if.else13
+	leaq	.L.str.4.60(%rip), %rdx         # exponent all ones, mantissa zero, sign clear
+	jmp	.LBB117_4
+.LBB117_3:                              # %if.then6
+	leaq	.L.str.1.57(%rip), %rdx         # exponent all ones, mantissa nonzero, sign set
+	jmp	.LBB117_4
+.LBB117_15:                             # %if.else26
+	testq	%rax, %rax
+	js	.LBB117_16
+# %bb.17:                               # %if.else30
+	leaq	.L.str.8.64(%rip), %rdx         # zero, selector zero, sign clear
+	jmp	.LBB117_4
+.LBB117_19:                             # %if.then34
+	leaq	.L.str.9.65(%rip), %rdx         # prefix emitted for negative values (presumably the minus sign; confirm)
+	movq	%r14, %rdi
+	movq	%rbx, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %r14                      # continue writing after the prefix
+	vmovsd	-48(%rbp), %xmm0                # xmm0 = mem[0],zero
+	vxorpd	.LCPI117_0(%rip), %xmm0, %xmm0  # flip the sign bit: work on the magnitude from here on
+	vmovlpd	%xmm0, -48(%rbp)
+	testl	%r12d, %r12d
+	jne	.LBB117_21
+.LBB117_35:                             # %if.else61
+	testl	%r13d, %r13d
+	je	.LBB117_36                      # exponent field zero with nonzero mantissa: handled by printing 0.0
+# %bb.37:                               # %if.end65
+	movq	%r14, -64(%rbp)                 # 8-byte Spill -- destination pointer, reloaded at .LBB117_39
+	movabsq	$4503599627370495, %rax         # imm = 0xFFFFFFFFFFFFF
+	incq	%rax
+	orq	%rax, %r15                      # set bit 52, the implicit leading one of the mantissa
+	movl	%r13d, %r14d
+	subl	$1075, %r14d                    # imm = 0x433 -- r14d = exponent field minus 1075
+	jae	.LBB117_40                      # exponent field at least 1075: the value has no fractional bits
+# %bb.38:                               # %if.end100.thread
+	movb	$51, %dl
+	subb	%r13b, %dl                      # shift count; shrx/shlx use it modulo 64
+	xorl	%eax, %eax
+	cmpl	$1023, %r13d                    # imm = 0x3FF
+	shrxq	%rdx, %r15, %rcx                # rcx = integer part of the mantissa
+	shlxq	%rdx, %rcx, %rdx                # rdx = integer part shifted back into place
+	cmovbq	%rax, %rcx                      # exponent field below 1023: integer part is 0
+	cmovbq	%rax, %rdx
+	subq	%rdx, %r15                      # r15 = remaining fractional mantissa bits
+	vmovq	%r15, %xmm0
+	vmovdqa	.LCPI117_6(%rip), %xmm1         # xmm1 = [1127219200,1160773632,0,0]
+	vpunpckldq	%xmm1, %xmm0, %xmm0     # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+	vmovapd	.LCPI117_7(%rip), %xmm2         # xmm2 = [4.503599627370496E+15,1.9342813113834067E+25]
+	vsubpd	%xmm2, %xmm0, %xmm0
+	vpermilpd	$1, %xmm0, %xmm3        # xmm3 = xmm0[1,0]
+	vaddsd	%xmm0, %xmm3, %xmm0             # xmm0 = r15 converted from unsigned 64-bit to double
+	shlq	$52, %r14
+	movabsq	$4696837146684686336, %rdx      # imm = 0x412E848000000000 -- bit pattern of 1.0E+6
+	addq	%r14, %rdx                      # add r14 to the exponent field: 1.0E+6 scaled by a power of two
+	vmovq	%rdx, %xmm3
+	vfmadd213sd	.LCPI117_4(%rip), %xmm0, %xmm3 # xmm3 = (xmm0 * xmm3) + mem -- fraction in millionths plus 0.5
+	vcvttsd2si	%xmm3, %rdx
+	movq	%rdx, %rsi
+	sarq	$63, %rsi
+	vsubsd	.LCPI117_5(%rip), %xmm3, %xmm0
+	vcvttsd2si	%xmm0, %r15
+	andq	%rsi, %r15
+	orq	%rdx, %r15                      # r15 = xmm3 converted to an unsigned 64-bit integer
+	vmovq	%r15, %xmm0
+	vpunpckldq	%xmm1, %xmm0, %xmm0     # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+	vsubpd	%xmm2, %xmm0, %xmm0
+	vpermilpd	$1, %xmm0, %xmm1        # xmm1 = xmm0[1,0]
+	vaddsd	%xmm0, %xmm1, %xmm0             # xmm0 = r15 converted back to double
+	vucomisd	%xmm0, %xmm3
+	setnp	%dl
+	sete	%sil
+	andb	%dl, %sil                       # sil = 1 when the conversion was exact (ordered and equal)
+	movl	%r15d, %edx
+	andb	%sil, %dl
+	movzbl	%dl, %edx
+	subq	%rdx, %r15                      # exact tie with an odd result: step down by one
+	xorl	%edx, %edx
+	cmpq	$1000000, %r15                  # imm = 0xF4240
+	cmoveq	%rax, %r15                      # fraction reached 1000000: reset it to 0 ...
+	sete	%dl
+	addq	%rcx, %rdx                      # ... and carry one into the integer part
+	leaq	-64(%rbp), %rsi                 # end of the local text buffer
+	leaq	-96(%rbp), %r12                 # start of the integer-part text
+	movq	%r12, %rdi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT      # integer part rendered into the local buffer
+	jmp	.LBB117_39
+.LBB117_7:                              # %if.then11
+	leaq	.L.str.3.59(%rip), %rdx         # exponent all ones, mantissa zero, sign set
+	jmp	.LBB117_4
+.LBB117_13:                             # %if.then22
+	leaq	.L.str.5.61(%rip), %rdx         # zero, selector nonzero, sign set
+	jmp	.LBB117_4
+.LBB117_31:                             # %if.else55
+	leaq	.L.str.12.68(%rip), %rdx        # exponent marker used when the decimal exponent is negative
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	callq	halide_string_to_string@PLT
+	negl	%r12d                           # print the exponent magnitude
+.LBB117_32:                             # %if.end58
+	movl	%r12d, %edx
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	movl	$2, %ecx                        # exponent padded to 2 digits
+	jmp	.LBB117_33
+.LBB117_16:                             # %if.then28
+	leaq	.L.str.7.63(%rip), %rdx         # zero, selector zero, sign set
+.LBB117_4:                              # %cleanup143
+	movq	%r14, %rdi
+	movq	%rbx, %rsi
+	callq	halide_string_to_string@PLT     # emit the fixed string selected above
+	jmp	.LBB117_34
+.LBB117_36:                             # %if.then63
+	vxorpd	%xmm0, %xmm0, %xmm0             # xmm0 = 0.0
+	movq	%r14, %rdi
+	movq	%rbx, %rsi
+	xorl	%edx, %edx
+	callq	halide_double_to_string@PLT     # recursive call with value 0.0 and selector 0
+	jmp	.LBB117_34
+.LBB117_40:                             # %if.end100
+	leaq	-64(%rbp), %rsi                 # end of the local text buffer
+	leaq	-96(%rbp), %r12                 # start of the integer-part text
+	movq	%r12, %rdi
+	movq	%r15, %rdx
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT      # render the 53-bit mantissa as an integer; rax = end of that text
+	xorl	%r15d, %r15d                    # fractional part is 0 on this path
+	testl	%r14d, %r14d
+	je	.LBB117_39
+# %bb.41:                               # %for.cond107.preheader.preheader
+	xorl	%ecx, %ecx                      # ecx = doubling passes done; r14d passes are needed
+	jmp	.LBB117_42
+	.p2align	4, 0x90
+.LBB117_47:                             # %if.end133
+                                        #   in Loop: Header=BB117_42 Depth=1
+	movq	%rdx, %r12                      # r12 = (possibly extended) start of the digit text
+.LBB117_48:                             # %if.end133
+                                        #   in Loop: Header=BB117_42 Depth=1
+	incl	%ecx
+	cmpl	%r14d, %ecx
+	je	.LBB117_39
+.LBB117_42:                             # %for.cond107.preheader
+                                        # =>This Loop Header: Depth=1
+                                        #     Child Loop BB117_44 Depth 2
+	movq	%r12, %rdx                      # rdx = start of the digit text for this pass
+	movq	%rax, %r12
+	cmpq	%rdx, %rax
+	je	.LBB117_48                      # empty text: nothing to double
+# %bb.43:                               # %for.body111.preheader
+                                        #   in Loop: Header=BB117_42 Depth=1
+	xorl	%edi, %edi                      # edi = carry into the lowest digit
+	movq	%rax, %rsi                      # walk from the end of the text towards its start
+	.p2align	4, 0x90
+.LBB117_44:                             # %for.body111
+                                        #   Parent Loop BB117_42 Depth=1
+                                        # =>  This Inner Loop Header: Depth=2
+	movzbl	-1(%rsi), %r8d
+	addb	$-48, %r8b                      # ASCII digit to its value
+	movzbl	%r8b, %r8d
+	addl	%r8d, %r8d                      # double the digit
+	orl	%edi, %r8d                      # add the carry (the doubled digit is even, so OR equals add)
+	leal	-10(%r8), %r9d
+	xorl	%edi, %edi
+	cmpb	$10, %r8b
+	setge	%dil                            # carry out when the result is 10 or more
+	movzbl	%r9b, %r9d
+	cmovll	%r8d, %r9d                      # keep the result as is when it is below 10
+	addb	$48, %r9b
+	movb	%r9b, -1(%rsi)                  # store the new ASCII digit in place
+	leaq	-1(%rsi), %r9
+	movq	%r9, %rsi
+	cmpq	%rdx, %r9
+	jne	.LBB117_44
+# %bb.45:                               # %for.cond.cleanup110
+                                        #   in Loop: Header=BB117_42 Depth=1
+	cmpb	$9, %r8b
+	jle	.LBB117_47                      # no carry out of the leading digit
+# %bb.46:                               # %if.then131
+                                        #   in Loop: Header=BB117_42 Depth=1
+	movb	$49, -1(%rdx)                   # carry out: prepend '1' one byte below the current start
+	decq	%rdx
+	jmp	.LBB117_47
+.LBB117_39:                             # %for.cond.cleanup
+	movq	-64(%rbp), %rdi                 # 8-byte Reload
+	movq	%rbx, %rsi
+	movq	%r12, %rdx
+	callq	halide_string_to_string@PLT     # copy the integer-part text to the destination
+	leaq	.L.str.30.142(%rip), %rdx       # separator string (presumably the decimal point; confirm)
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	movq	%r15, %rdx
+	movl	$6, %ecx                        # fractional part padded to 6 digits
+.LBB117_33:                             # %cleanup143
+	callq	halide_int64_to_string@PLT
+.LBB117_34:                             # %cleanup143
+	addq	$536, %rsp                      # imm = 0x218
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end117:
+	.size	halide_double_to_string, .Lfunc_end117-halide_double_to_string
+                                        # -- End function
+	.section	.text.halide_pointer_to_string,"ax",@progbits
+	.weak	halide_pointer_to_string        # -- Begin function halide_pointer_to_string
+	.p2align	4, 0x90
+	.type	halide_pointer_to_string,@function
+halide_pointer_to_string:               # @halide_pointer_to_string -- renders the 64-bit value in rdx as 0x followed by its hexadecimal digits without leading zeros (at least one digit) in a stack buffer, then appends it to rdi (end rsi) via halide_string_to_string; the loop over nibbles is fully unrolled
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	subq	$32, %rsp
+	vxorps	%xmm0, %xmm0, %xmm0
+	vmovaps	%xmm0, -32(%rbp)                # zero bytes -32..-17 of the scratch buffer
+	movl	$0, -16(%rbp)                   # zero bytes -16..-13; the byte at -13 stays zero as terminator
+	movl	%edx, %eax
+	andl	$15, %eax                       # lowest nibble
+	leaq	.L.str.13.71(%rip), %rcx        # rcx = digit lookup table indexed by nibble value
+	movzbl	(%rax,%rcx), %r8d
+	leaq	-15(%rbp), %rax
+	movb	%r8b, -14(%rbp)                 # least significant digit goes at -14(%rbp); later digits go lower
+	cmpq	$16, %rdx
+	jae	.LBB118_2
+# %bb.1:
+	leaq	-14(%rbp), %rdx                 # single digit: rdx = its address, rax = slot for 'x'
+	jmp	.LBB118_22
+.LBB118_2:                              # %for.inc
+	movl	%edx, %r8d
+	shrl	$4, %r8d
+	andl	$15, %r8d
+	movzbl	(%r8,%rcx), %r9d
+	leaq	-16(%rbp), %r8
+	movb	%r9b, -15(%rbp)                 # digit for bits 4..7
+	cmpq	$256, %rdx                      # imm = 0x100
+	jb	.LBB118_21                      # no higher nonzero bits: finish
+# %bb.3:                                # %for.inc.1
+	movl	%edx, %eax
+	shrl	$8, %eax
+	andl	$15, %eax
+	movzbl	(%rax,%rcx), %r9d
+	leaq	-17(%rbp), %rax
+	movb	%r9b, -16(%rbp)                 # digit for bits 8..11
+	cmpq	$4096, %rdx                     # imm = 0x1000
+	jb	.LBB118_4
+# %bb.5:                                # %for.inc.2
+	movl	%edx, %r8d
+	shrl	$12, %r8d
+	andl	$15, %r8d
+	movzbl	(%r8,%rcx), %r9d
+	leaq	-18(%rbp), %r8
+	movb	%r9b, -17(%rbp)                 # digit for bits 12..15
+	cmpq	$65536, %rdx                    # imm = 0x10000
+	jb	.LBB118_21
+# %bb.6:                                # %for.inc.3
+	movl	%edx, %eax
+	shrl	$16, %eax
+	andl	$15, %eax
+	movzbl	(%rax,%rcx), %r9d
+	leaq	-19(%rbp), %rax
+	movb	%r9b, -18(%rbp)                 # digit for bits 16..19
+	cmpq	$1048576, %rdx                  # imm = 0x100000
+	jb	.LBB118_4
+# %bb.8:                                # %for.inc.4
+	movl	%edx, %r8d
+	shrl	$20, %r8d
+	andl	$15, %r8d
+	movzbl	(%r8,%rcx), %r9d
+	leaq	-20(%rbp), %r8
+	movb	%r9b, -19(%rbp)                 # digit for bits 20..23
+	cmpq	$16777216, %rdx                 # imm = 0x1000000
+	jb	.LBB118_21
+# %bb.9:                                # %for.inc.5
+	movl	%edx, %eax
+	shrl	$24, %eax
+	andl	$15, %eax
+	movzbl	(%rax,%rcx), %r9d
+	leaq	-21(%rbp), %rax
+	movb	%r9b, -20(%rbp)                 # digit for bits 24..27
+	cmpq	$268435456, %rdx                # imm = 0x10000000
+	jb	.LBB118_4
+# %bb.11:                               # %for.inc.6
+	movl	%edx, %r8d
+	shrl	$28, %r8d                       # top nibble of the low 32 bits; no mask needed
+	movzbl	(%r8,%rcx), %r9d
+	leaq	-22(%rbp), %r8
+	movb	%r9b, -21(%rbp)                 # digit for bits 28..31
+	movq	%rdx, %r9
+	shrq	$32, %r9                        # from here on the remaining value is tested via 64-bit shifts
+	je	.LBB118_21
+# %bb.12:                               # %for.inc.7
+	andl	$15, %r9d
+	movzbl	(%r9,%rcx), %r9d
+	leaq	-23(%rbp), %rax
+	movb	%r9b, -22(%rbp)                 # digit for bits 32..35
+	movq	%rdx, %r9
+	shrq	$36, %r9
+	je	.LBB118_4
+# %bb.14:                               # %for.inc.8
+	andl	$15, %r9d
+	movzbl	(%r9,%rcx), %r9d
+	leaq	-24(%rbp), %r8
+	movb	%r9b, -23(%rbp)                 # digit for bits 36..39
+	movq	%rdx, %r9
+	shrq	$40, %r9
+	je	.LBB118_21
+# %bb.15:                               # %for.inc.9
+	andl	$15, %r9d
+	movzbl	(%r9,%rcx), %r9d
+	leaq	-25(%rbp), %rax
+	movb	%r9b, -24(%rbp)                 # digit for bits 40..43
+	movq	%rdx, %r9
+	shrq	$44, %r9
+	je	.LBB118_4
+# %bb.17:                               # %for.inc.10
+	andl	$15, %r9d
+	movzbl	(%r9,%rcx), %r9d
+	leaq	-26(%rbp), %r8
+	movb	%r9b, -25(%rbp)                 # digit for bits 44..47
+	movq	%rdx, %r9
+	shrq	$48, %r9
+	je	.LBB118_21
+# %bb.18:                               # %for.inc.11
+	andl	$15, %r9d
+	movzbl	(%r9,%rcx), %r9d
+	leaq	-27(%rbp), %rax
+	movb	%r9b, -26(%rbp)                 # digit for bits 48..51
+	movq	%rdx, %r9
+	shrq	$52, %r9
+	je	.LBB118_4
+# %bb.20:                               # %for.inc.12
+	andl	$15, %r9d
+	movzbl	(%r9,%rcx), %r9d
+	leaq	-28(%rbp), %r8
+	movb	%r9b, -27(%rbp)                 # digit for bits 52..55
+	movq	%rdx, %r9
+	shrq	$56, %r9
+	jne	.LBB118_23
+.LBB118_21:
+	movq	%rax, %rdx                      # rdx = address of the most significant digit written
+	movq	%r8, %rax                       # rax = slot just below it, for 'x'
+	jmp	.LBB118_22
+.LBB118_23:                             # %for.inc.13
+	andl	$15, %r9d
+	movzbl	(%r9,%rcx), %r9d
+	leaq	-29(%rbp), %rax
+	movb	%r9b, -28(%rbp)                 # digit for bits 56..59
+	shrq	$60, %rdx
+	jne	.LBB118_25
+.LBB118_4:
+	movq	%r8, %rdx                       # rdx = address of the most significant digit written; rax already holds the slot for 'x'
+.LBB118_22:                             # %cleanup
+	movb	$120, (%rax)                    # 120 is ASCII 'x'
+	movb	$48, -2(%rdx)                   # 48 is ASCII '0', placed just before the 'x'
+	addq	$-2, %rdx                       # rdx = start of the finished text
+	callq	halide_string_to_string@PLT     # rdi and rsi still hold the caller's destination and end
+	addq	$32, %rsp
+	popq	%rbp
+	retq
+.LBB118_25:                             # %for.inc.14
+	movzbl	(%rdx,%rcx), %ecx               # rdx holds the top nibble (value shifted right by 60)
+	movq	%rax, %rdx
+	leaq	-30(%rbp), %rax
+	movb	%cl, -29(%rbp)                  # digit for bits 60..63
+	jmp	.LBB118_22
+.Lfunc_end118:
+	.size	halide_pointer_to_string, .Lfunc_end118-halide_pointer_to_string
+                                        # -- End function
+	.section	.text.halide_type_to_string,"ax",@progbits
+	.weak	halide_type_to_string           # -- Begin function halide_type_to_string
+	.p2align	4, 0x90
+	.type	halide_type_to_string,@function
+halide_type_to_string:                  # @halide_type_to_string -- appends a textual form of the type record pointed to by rdx to rdi (end rsi): a name chosen by the byte at offset 0, the byte at offset 1 in decimal, and, when the 16-bit field at offset 2 is not 1, a separator string followed by that field in decimal
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r14
+	pushq	%rbx
+	movq	%rdx, %r14                      # r14 = type record pointer
+	movq	%rsi, %rbx                      # rbx = end of the destination buffer
+	movsbq	(%rdx), %rax                    # sign-extended code byte at offset 0
+	cmpq	$4, %rax
+	ja	.LBB119_1                       # unsigned compare: codes above 4 and negative codes use the fallback name
+# %bb.2:                                # %switch.lookup
+	leaq	.Lreltable.halide_type_to_string(%rip), %rcx
+	movslq	(%rcx,%rax,4), %rdx             # 32-bit offset relative to the table start
+	addq	%rcx, %rdx                      # rdx = address of the name string for this code
+	jmp	.LBB119_3
+.LBB119_1:
+	leaq	.L.str.19.72(%rip), %rdx        # fallback name for unrecognised codes
+.LBB119_3:                              # %sw.epilog
+	movq	%rbx, %rsi
+	callq	halide_string_to_string@PLT     # rdi still holds the caller's destination
+	movzbl	1(%r14), %edx                   # byte at offset 1 (presumably the bit width; confirm)
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	movl	$1, %ecx
+	callq	halide_uint64_to_string@PLT
+	cmpw	$1, 2(%r14)                     # 16-bit field at offset 2 (presumably the lane count; confirm)
+	jne	.LBB119_5
+# %bb.4:                                # %if.end
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	retq                                    # field equals 1: nothing more to append
+.LBB119_5:                              # %if.then
+	leaq	.L.str.20.78(%rip), %rdx        # separator placed before the offset-2 field
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	callq	halide_string_to_string@PLT
+	movzwl	2(%r14), %edx
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	movl	$1, %ecx
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	jmp	halide_uint64_to_string@PLT     # TAILCALL
+.Lfunc_end119:
+	.size	halide_type_to_string, .Lfunc_end119-halide_type_to_string
+                                        # -- End function
+	.section	.text.halide_buffer_to_string,"ax",@progbits
+	.weak	halide_buffer_to_string         # -- Begin function halide_buffer_to_string
+	.p2align	4, 0x90
+	.type	halide_buffer_to_string,@function
+halide_buffer_to_string:                # @halide_buffer_to_string -- appends a textual dump of the buffer record pointed to by rdx to rdi (end rsi): its address, the fields at offsets 0, 8, 16 and 24, the type record at offset 32, then three 32-bit values per dimension; a null rdx emits .L.str.21.79 instead
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	pushq	%rax                            # reserves 8 bytes of stack; the pushed value is never read back
+	movq	%rsi, %rbx                      # rbx = end of the destination buffer
+	testq	%rdx, %rdx
+	je	.LBB120_1                       # null record
+# %bb.3:                                # %if.end
+	movq	%rdx, %r14                      # r14 = buffer record pointer
+	movq	%rbx, %rsi
+	callq	halide_pointer_to_string@PLT    # the record's own address; rdi and rdx are unchanged from entry
+	leaq	.L.str.22.80(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	callq	halide_string_to_string@PLT
+	movq	(%r14), %rdx                    # 64-bit field at offset 0, printed in decimal
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	movl	$1, %ecx
+	callq	halide_uint64_to_string@PLT
+	leaq	.L.str.55(%rip), %r15           # r15 = separator string reused between fields
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	movq	%r15, %rdx
+	callq	halide_string_to_string@PLT
+	movq	8(%r14), %rdx                   # field at offset 8, printed as a pointer
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	callq	halide_pointer_to_string@PLT
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	movq	%r15, %rdx
+	callq	halide_string_to_string@PLT
+	movq	16(%r14), %rdx                  # field at offset 16, printed as a pointer
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	callq	halide_pointer_to_string@PLT
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	movq	%r15, %rdx
+	callq	halide_string_to_string@PLT
+	movq	24(%r14), %rdx                  # 64-bit field at offset 24, printed in decimal
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	movl	$1, %ecx
+	callq	halide_uint64_to_string@PLT
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	movq	%r15, %rdx
+	callq	halide_string_to_string@PLT
+	leaq	32(%r14), %rdx                  # address of the type record embedded at offset 32
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	callq	halide_type_to_string@PLT
+	cmpl	$0, 36(%r14)                    # 32-bit dimension count at offset 36
+	jle	.LBB120_6
+# %bb.4:                                # %for.body.lr.ph
+	xorl	%r15d, %r15d                    # r15 = byte offset into the dimension array (16 bytes per entry)
+	leaq	.L.str.55(%rip), %r12           # r12 = separator string
+	xorl	%r13d, %r13d                    # r13 = dimension index
+	.p2align	4, 0x90
+.LBB120_5:                              # %for.body
+                                        # =>This Inner Loop Header: Depth=1
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	leaq	.L.str.24.83(%rip), %rdx        # text emitted before each dimension
+	callq	halide_string_to_string@PLT
+	movq	40(%r14), %rcx                  # pointer to the dimension array, at offset 40
+	movslq	(%rcx,%r15), %rdx               # first 32-bit value of this dimension (presumably min; confirm)
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	movq	%r12, %rdx
+	callq	halide_string_to_string@PLT
+	movq	40(%r14), %rcx
+	movslq	4(%rcx,%r15), %rdx              # second 32-bit value (presumably extent; confirm)
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	movq	%r12, %rdx
+	callq	halide_string_to_string@PLT
+	movq	40(%r14), %rcx
+	movslq	8(%rcx,%r15), %rdx              # third 32-bit value (presumably stride; confirm)
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	leaq	.L.str.25.84(%rip), %rdx        # text emitted after each dimension
+	callq	halide_string_to_string@PLT
+	incq	%r13
+	movslq	36(%r14), %rcx                  # dimension count is re-read on every iteration
+	addq	$16, %r15
+	cmpq	%rcx, %r13
+	jl	.LBB120_5
+.LBB120_6:                              # %for.cond.cleanup
+	leaq	.L.str.8.120(%rip), %rdx        # closing text
+	movq	%rax, %rdi
+	jmp	.LBB120_2
+.LBB120_1:                              # %if.then
+	leaq	.L.str.21.79(%rip), %rdx        # text emitted for a null record; rdi is still the caller's destination
+.LBB120_2:                              # %if.then
+	movq	%rbx, %rsi
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	jmp	halide_string_to_string@PLT     # TAILCALL
+.Lfunc_end120:
+	.size	halide_buffer_to_string, .Lfunc_end120-halide_buffer_to_string
+                                        # -- End function
+	.section	.text.halide_internal_malloc_alignment,"ax",@progbits
+	.weak	halide_internal_malloc_alignment # -- Begin function halide_internal_malloc_alignment
+	.p2align	4, 0x90
+	.type	halide_internal_malloc_alignment,@function
+halide_internal_malloc_alignment:       # @halide_internal_malloc_alignment -- reads no arguments and always returns the constant 64 in %eax
+# %bb.0:                                # %entry
+	pushq	%rbp                            # frame setup only; no locals are used
+	movq	%rsp, %rbp
+	movl	$64, %eax                       # return value; going by the symbol name this is the malloc alignment, presumably in bytes
+	popq	%rbp
+	retq
+.Lfunc_end121:
+	.size	halide_internal_malloc_alignment, .Lfunc_end121-halide_internal_malloc_alignment
+                                        # -- End function
+	.section	.text.halide_fopen,"ax",@progbits
+	.weak	halide_fopen                    # -- Begin function halide_fopen
+	.p2align	4, 0x90
+	.type	halide_fopen,@function
+halide_fopen:                           # @halide_fopen -- thin forwarder: hands its incoming argument registers unchanged to fopen64
+# %bb.0:                                # %entry
+	pushq	%rbp                            # frame is created and torn down again before the jump
+	movq	%rsp, %rbp
+	popq	%rbp
+	jmp	fopen64@PLT                     # TAILCALL -- fopen64 returns directly to the caller of halide_fopen
+.Lfunc_end122:
+	.size	halide_fopen, .Lfunc_end122-halide_fopen
+                                        # -- End function
+	.section	.text.halide_reuse_device_allocations,"ax",@progbits
+	.weak	halide_reuse_device_allocations # -- Begin function halide_reuse_device_allocations
+	.p2align	4, 0x90
+	.type	halide_reuse_device_allocations,@function
+halide_reuse_device_allocations:        # @halide_reuse_device_allocations -- stores the flag passed in %esi; when it is zero, also calls every registered pool callback under allocation_pools_lock
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%rbx
+	pushq	%rax                            # scratch slot, dropped by addq $8 in the epilogue (presumably stack-alignment padding)
+	movq	_ZN6Halide7Runtime8Internal36halide_reuse_device_allocations_flagE@GOTPCREL(%rip), %rax
+	movb	%sil, (%rax)                    # record the low byte of the second argument in the global flag
+	xorl	%r14d, %r14d                    # %r14d accumulates the return value, initially 0
+	testl	%esi, %esi
+	jne	.LBB123_4                       # nonzero flag: nothing else to do, return 0
+# %bb.1:                                # %if.then
+	movq	%rdi, %rbx                      # keep the first argument; it is passed to every callback below
+	movq	_ZN6Halide7Runtime8Internal21allocation_pools_lockE@GOTPCREL(%rip), %rdi
+	callq	halide_mutex_lock@PLT
+	movq	_ZN6Halide7Runtime8Internal23device_allocation_poolsE@GOTPCREL(%rip), %rax
+	movq	(%rax), %r15                    # %r15 = head of the singly linked pool list
+	xorl	%r14d, %r14d
+	testq	%r15, %r15
+	je	.LBB123_3                       # empty list
+	.p2align	4, 0x90
+.LBB123_5:                              # %for.body
+                                        # =>This Inner Loop Header: Depth=1
+	movq	%rbx, %rdi
+	callq	*(%r15)                         # invoke the function pointer stored at offset 0 of the node
+	testl	%eax, %eax
+	cmovnel	%eax, %r14d                     # remember the most recent nonzero callback result
+	movq	8(%r15), %r15                   # advance through the link stored at offset 8
+	testq	%r15, %r15
+	jne	.LBB123_5
+.LBB123_3:                              # %for.cond.cleanup
+	movq	_ZN6Halide7Runtime8Internal21allocation_pools_lockE@GOTPCREL(%rip), %rdi
+	callq	halide_mutex_unlock@PLT
+.LBB123_4:                              # %if.end5
+	movl	%r14d, %eax                     # 0, or the last nonzero result returned by a callback
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end123:
+	.size	halide_reuse_device_allocations, .Lfunc_end123-halide_reuse_device_allocations
+                                        # -- End function
+	.section	.text.halide_can_reuse_device_allocations,"ax",@progbits
+	.weak	halide_can_reuse_device_allocations # -- Begin function halide_can_reuse_device_allocations
+	.p2align	4, 0x90
+	.type	halide_can_reuse_device_allocations,@function
+halide_can_reuse_device_allocations:    # @halide_can_reuse_device_allocations -- returns the current value of the global halide_reuse_device_allocations_flag byte
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	_ZN6Halide7Runtime8Internal36halide_reuse_device_allocations_flagE@GOTPCREL(%rip), %rax
+	movzbl	(%rax), %eax                    # zero-extend the one-byte flag into the return register; no lock is taken here
+	popq	%rbp
+	retq
+.Lfunc_end124:
+	.size	halide_can_reuse_device_allocations, .Lfunc_end124-halide_can_reuse_device_allocations
+                                        # -- End function
+	.section	.text.halide_register_device_allocation_pool,"ax",@progbits
+	.weak	halide_register_device_allocation_pool # -- Begin function halide_register_device_allocation_pool
+	.p2align	4, 0x90
+	.type	halide_register_device_allocation_pool,@function
+halide_register_device_allocation_pool: # @halide_register_device_allocation_pool -- pushes the node passed in %rdi onto the front of the device_allocation_pools list while holding allocation_pools_lock
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r14
+	pushq	%rbx
+	movq	%rdi, %rbx                      # %rbx = node being registered
+	movq	_ZN6Halide7Runtime8Internal21allocation_pools_lockE@GOTPCREL(%rip), %r14
+	movq	%r14, %rdi
+	callq	halide_mutex_lock@PLT
+	movq	_ZN6Halide7Runtime8Internal23device_allocation_poolsE@GOTPCREL(%rip), %rax
+	movq	(%rax), %rcx                    # %rcx = previous list head
+	movq	%rcx, 8(%rbx)                   # the link at offset 8 of the node now points at the previous head
+	movq	%rbx, (%rax)                    # the node becomes the new head
+	movq	%r14, %rdi
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	jmp	halide_mutex_unlock@PLT         # TAILCALL -- the unlock routine returns directly to our caller
+.Lfunc_end125:
+	.size	halide_register_device_allocation_pool, .Lfunc_end125-halide_register_device_allocation_pool
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal27copy_to_host_already_lockedEPvP15halide_buffer_t,"ax",@progbits
+	.weak	_ZN6Halide7Runtime8Internal27copy_to_host_already_lockedEPvP15halide_buffer_t # -- Begin function _ZN6Halide7Runtime8Internal27copy_to_host_already_lockedEPvP15halide_buffer_t
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal27copy_to_host_already_lockedEPvP15halide_buffer_t,@function
+_ZN6Halide7Runtime8Internal27copy_to_host_already_lockedEPvP15halide_buffer_t: # @_ZN6Halide7Runtime8Internal27copy_to_host_already_lockedEPvP15halide_buffer_t -- takes (void*, halide_buffer_t*) in %rdi/%rsi and takes no lock itself; returns 0, -14 or -19 in %eax
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%rbx
+	pushq	%rax                            # scratch slot, dropped by addq $8 in the epilogue (presumably stack-alignment padding)
+	movq	24(%rsi), %rax                  # 64-bit word at buffer offset 24, presumably the flags field
+	xorl	%ebx, %ebx                      # %ebx holds the return value, initially 0
+	testb	$2, %al
+	je	.LBB126_6                       # mask-2 bit clear: no copy is performed, return 0
+# %bb.1:                                # %if.end
+	movl	$-14, %ebx
+	testb	$1, %al
+	jne	.LBB126_6                       # mask-1 bit also set: return -14 without copying
+# %bb.2:                                # %if.end9
+	movq	%rsi, %r14                      # %r14 = buffer
+	movq	8(%rsi), %rax                   # pointer at buffer offset 8, presumably the device interface
+	testq	%rax, %rax
+	je	.LBB126_3                       # null pointer: return -19
+# %bb.4:                                # %if.end15
+	movq	%rdi, %r15                      # %r15 = first argument, reused for the msan call
+	movq	120(%rax), %rax                 # table of function pointers reached through offset 120
+	movq	%r14, %rsi
+	callq	*48(%rax)                       # table entry at offset 48, called with the original (void*, buffer) arguments
+	testl	%eax, %eax
+	jne	.LBB126_6                       # nonzero result: return the -14 still held in %ebx
+# %bb.5:                                # %if.end22
+	andb	$-3, 24(%r14)                   # clear the mask-2 bit that triggered the copy
+	movq	%r15, %rdi
+	movq	%r14, %rsi
+	callq	halide_msan_annotate_buffer_is_initialized@PLT
+	xorl	%ebx, %ebx                      # success: return 0
+	jmp	.LBB126_6
+.LBB126_3:
+	movl	$-19, %ebx
+.LBB126_6:                              # %return
+	movl	%ebx, %eax
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end126:
+	.size	_ZN6Halide7Runtime8Internal27copy_to_host_already_lockedEPvP15halide_buffer_t, .Lfunc_end126-_ZN6Halide7Runtime8Internal27copy_to_host_already_lockedEPvP15halide_buffer_t
+                                        # -- End function
+	.section	.text.halide_device_release,"ax",@progbits
+	.weak	halide_device_release           # -- Begin function halide_device_release
+	.p2align	4, 0x90
+	.type	halide_device_release,@function
+halide_device_release:                  # @halide_device_release -- dispatches to the function pointer at offset 40 of the table reached through the second argument; %rsi is dereferenced without a null check
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	120(%rsi), %rax                 # table pointer stored at offset 120 of the object in %rsi (presumably a device interface)
+	popq	%rbp
+	jmpq	*40(%rax)                       # TAILCALL -- argument registers are forwarded untouched
+.Lfunc_end127:
+	.size	halide_device_release, .Lfunc_end127-halide_device_release
+                                        # -- End function
+	.section	.text.halide_copy_to_host,"ax",@progbits
+	.weak	halide_copy_to_host             # -- Begin function halide_copy_to_host
+	.p2align	4, 0x90
+	.type	halide_copy_to_host,@function
+halide_copy_to_host:                    # @halide_copy_to_host -- holds device_copy_mutex while validating the buffer in %rsi and calling copy_to_host_already_locked; returns that result, or the nonzero code from a validation error reporter
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%rbx
+	pushq	%rax                            # scratch slot, dropped by addq $8 in the epilogue (presumably stack-alignment padding)
+	movq	%rsi, %rbx                      # %rbx = buffer
+	movq	%rdi, %r14                      # %r14 = first argument, handed to every callee
+	movq	_ZN6Halide7Runtime8Internal17device_copy_mutexE@GOTPCREL(%rip), %rdi
+	callq	halide_mutex_lock@PLT
+	testq	%rbx, %rbx
+	je	.LBB128_1                       # null buffer
+# %bb.2:                                # %if.end.i
+	movq	(%rbx), %rax                    # word at buffer offset 0, presumably the device handle
+	movq	8(%rbx), %rcx                   # pointer at buffer offset 8, presumably the device interface
+	testq	%rax, %rax
+	je	.LBB128_5
+# %bb.3:                                # %if.end.i
+	testq	%rcx, %rcx
+	jne	.LBB128_5
+# %bb.4:                                # %if.then8.i
+	movq	%r14, %rdi                      # offset 0 is nonzero but offset 8 is null
+	callq	halide_error_no_device_interface@PLT
+	movl	%eax, %r15d
+	testl	%eax, %eax
+	jne	.LBB128_12                      # nonzero error code: unlock and return it
+	jmp	.LBB128_11                      # a zero result from the reporter still proceeds to the copy
+.LBB128_1:                              # %if.then.i
+	leaq	.L.str.6.91(%rip), %rsi
+	movq	%r14, %rdi
+	callq	halide_error_buffer_is_null@PLT
+	movl	%eax, %r15d
+	testl	%eax, %eax
+	jne	.LBB128_12
+	jmp	.LBB128_11
+.LBB128_5:                              # %if.end16.i
+	testq	%rcx, %rcx
+	je	.LBB128_8
+# %bb.6:                                # %if.end16.i
+	testq	%rax, %rax
+	jne	.LBB128_8
+# %bb.7:                                # %if.then20.i
+	movq	%r14, %rdi                      # offset 8 is non-null but offset 0 is zero
+	callq	halide_error_device_interface_no_device@PLT
+	movl	%eax, %r15d
+	testl	%eax, %eax
+	jne	.LBB128_12
+	jmp	.LBB128_11
+.LBB128_8:                              # %if.end28.i
+	movl	24(%rbx), %eax                  # low 32 bits of the word at offset 24, presumably the flags
+	notl	%eax
+	testb	$3, %al                         # after the inversion this is nonzero unless both low bits were set
+	jne	.LBB128_11
+# %bb.9:                                # %if.then36.i
+	movq	%r14, %rdi                      # both low bits set
+	callq	halide_error_host_and_device_dirty@PLT
+	movl	%eax, %r15d
+	testl	%eax, %eax
+	jne	.LBB128_12
+.LBB128_11:                             # %_ZN12_GLOBAL__N_126debug_log_and_validate_bufEPvPK15halide_buffer_tPKc.exit.split
+	movq	%r14, %rdi
+	movq	%rbx, %rsi
+	callq	_ZN6Halide7Runtime8Internal27copy_to_host_already_lockedEPvP15halide_buffer_t@PLT
+	movl	%eax, %r15d                     # %r15d carries the return value across the unlock call
+.LBB128_12:                             # %cleanup
+	movq	_ZN6Halide7Runtime8Internal17device_copy_mutexE@GOTPCREL(%rip), %rdi
+	callq	halide_mutex_unlock@PLT         # every path that took the lock releases it here
+	movl	%r15d, %eax
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end128:
+	.size	halide_copy_to_host, .Lfunc_end128-halide_copy_to_host
+                                        # -- End function
+	.section	.text.copy_to_device_already_locked,"ax",@progbits
+	.weak	copy_to_device_already_locked   # -- Begin function copy_to_device_already_locked
+	.p2align	4, 0x90
+	.type	copy_to_device_already_locked,@function
+copy_to_device_already_locked:          # @copy_to_device_already_locked -- three arguments in %rdi, %rsi (buffer), %rdx (interface, may be null); takes no lock itself; returns 0, -15, -19, -42 or a code produced by a callee
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r12
+	pushq	%rbx
+	movq	%rdx, %r15                      # %r15 = requested interface
+	movq	%rsi, %r14                      # %r14 = buffer
+	movq	%rdi, %rbx                      # %rbx = first argument, handed to every callee
+	testq	%rsi, %rsi
+	je	.LBB129_1                       # null buffer
+# %bb.2:                                # %if.end.i
+	movq	(%r14), %rax                    # word at buffer offset 0, presumably the device handle
+	movq	8(%r14), %rcx                   # pointer at buffer offset 8, presumably the device interface
+	testq	%rax, %rax
+	je	.LBB129_5
+# %bb.3:                                # %if.end.i
+	testq	%rcx, %rcx
+	jne	.LBB129_5
+# %bb.4:                                # %if.then8.i
+	movq	%rbx, %rdi                      # offset 0 is nonzero but offset 8 is null
+	callq	halide_error_no_device_interface@PLT
+	movl	%eax, %r12d
+	testl	%eax, %eax
+	jne	.LBB129_25                      # nonzero error code: return it
+	jmp	.LBB129_11                      # a zero result from the reporter still continues
+.LBB129_1:                              # %if.then.i
+	leaq	.L.str.7.92(%rip), %rsi
+	movq	%rbx, %rdi
+	callq	halide_error_buffer_is_null@PLT
+	movl	%eax, %r12d
+	testl	%eax, %eax
+	jne	.LBB129_25
+	jmp	.LBB129_11
+.LBB129_5:                              # %if.end16.i
+	testq	%rcx, %rcx
+	je	.LBB129_8
+# %bb.6:                                # %if.end16.i
+	testq	%rax, %rax
+	jne	.LBB129_8
+# %bb.7:                                # %if.then20.i
+	movq	%rbx, %rdi                      # offset 8 is non-null but offset 0 is zero
+	callq	halide_error_device_interface_no_device@PLT
+	movl	%eax, %r12d
+	testl	%eax, %eax
+	jne	.LBB129_25
+	jmp	.LBB129_11
+.LBB129_8:                              # %if.end28.i
+	movl	24(%r14), %eax                  # low 32 bits of the word at offset 24, presumably the flags
+	notl	%eax
+	testb	$3, %al                         # after the inversion this is nonzero unless both low bits were set
+	jne	.LBB129_11
+# %bb.9:                                # %if.then36.i
+	movq	%rbx, %rdi                      # both low bits set
+	callq	halide_error_host_and_device_dirty@PLT
+	movl	%eax, %r12d
+	testl	%eax, %eax
+	jne	.LBB129_25
+.LBB129_11:                             # %if.end
+	testq	%r15, %r15
+	jne	.LBB129_14
+# %bb.12:                               # %if.then1
+	movq	8(%r14), %r15                   # no interface was passed: fall back to the pointer stored in the buffer
+	testq	%r15, %r15
+	je	.LBB129_13                      # still null: return -19
+.LBB129_14:                             # %if.end9
+	cmpq	$0, (%r14)
+	je	.LBB129_20                      # offset 0 is zero: call halide_device_malloc first
+# %bb.15:                               # %land.lhs.true
+	cmpq	%r15, 8(%r14)
+	je	.LBB129_21                      # stored pointer matches the interface in use
+# %bb.16:                               # %if.then13
+	movl	$1024, %edi                     # imm = 0x400 -- mismatch: allocate a 1024-byte buffer for the error message
+	callq	malloc@PLT
+	movq	%rax, %r14                      # %r14 is reused for the message buffer from here on
+	testq	%rax, %rax
+	je	.LBB129_17                      # allocation failed
+# %bb.18:                               # %if.else.i
+	leaq	1023(%r14), %rsi                # second argument: address of the last byte of the allocation
+	movb	$0, 1023(%r14)                  # store a zero byte at the last position
+	leaq	.L.str.9.93(%rip), %rdx
+	movq	%r14, %rdi
+	callq	halide_string_to_string@PLT
+	subq	%r14, %rax                      # distance between the returned pointer and the buffer start
+	leaq	1(%rax), %rdx                   # that distance plus one is the size passed to the msan annotation
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	callq	halide_msan_annotate_memory_is_initialized@PLT
+	movq	%r14, %rsi                      # the composed message is what gets reported
+	jmp	.LBB129_19
+.LBB129_20:                             # %if.then19
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	movq	%r15, %rdx
+	callq	halide_device_malloc@PLT
+	movl	%eax, %r12d
+	testl	%eax, %eax
+	jne	.LBB129_25                      # propagate the halide_device_malloc failure code
+.LBB129_21:                             # %if.end28
+	movq	24(%r14), %rax                  # re-read the word at offset 24
+	xorl	%r12d, %r12d
+	testb	$1, %al
+	je	.LBB129_25                      # mask-1 bit clear: no copy is performed, return 0
+# %bb.22:                               # %if.then30
+	movl	$-15, %r12d
+	testb	$2, %al
+	jne	.LBB129_25                      # mask-2 bit also set: return -15 without copying
+# %bb.23:                               # %if.else
+	movq	120(%r15), %rax                 # table of function pointers reached through offset 120 of the interface
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	callq	*56(%rax)                       # table entry at offset 56, called with (first argument, buffer)
+	testl	%eax, %eax
+	jne	.LBB129_25                      # nonzero result: return the -15 still held in %r12d
+# %bb.24:                               # %if.then47
+	andb	$-2, 24(%r14)                   # clear the mask-1 bit that triggered the copy
+	xorl	%r12d, %r12d
+	jmp	.LBB129_25
+.LBB129_13:
+	movl	$-19, %r12d
+	jmp	.LBB129_25
+.LBB129_17:                             # %if.then.i106
+	leaq	.L.str.9.93(%rip), %rdx
+	xorl	%edi, %edi                      # the formatter is still called, with null destination and end pointers
+	xorl	%esi, %esi
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.29.165(%rip), %rsi       # report this fixed string in place of the message that could not be built
+.LBB129_19:                             # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT
+	movq	%r14, %rdi                      # message buffer; null on the allocation-failure path
+	callq	free@PLT
+	movl	$-42, %r12d
+.LBB129_25:                             # %cleanup
+	movl	%r12d, %eax
+	popq	%rbx
+	popq	%r12
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end129:
+	.size	copy_to_device_already_locked, .Lfunc_end129-copy_to_device_already_locked
+                                        # -- End function
+	.section	.text.halide_device_malloc,"ax",@progbits
+	.weak	halide_device_malloc            # -- Begin function halide_device_malloc
+	.p2align	4, 0x90
+	.type	halide_device_malloc,@function
+halide_device_malloc:                   # @halide_device_malloc -- three arguments in %rdi, %rsi (buffer), %rdx (interface); validates the buffer, then calls the interface table entry at offset 16; returns 0, -16, -42 or a validation error code
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r12
+	pushq	%rbx
+	movq	%rdx, %rbx                      # %rbx = interface
+	movq	%rsi, %r15                      # %r15 = buffer
+	movq	%rdi, %r14                      # %r14 = first argument, handed to every callee
+	testq	%rsi, %rsi
+	je	.LBB130_1                       # null buffer
+# %bb.2:                                # %if.end.i
+	movq	(%r15), %rcx                    # word at buffer offset 0, presumably the device handle
+	movq	8(%r15), %rax                   # pointer at buffer offset 8, presumably the device interface
+	testq	%rcx, %rcx
+	je	.LBB130_5
+# %bb.3:                                # %if.end.i
+	testq	%rax, %rax
+	jne	.LBB130_5
+# %bb.4:                                # %if.then8.i
+	movq	%r14, %rdi                      # offset 0 is nonzero but offset 8 is null
+	callq	halide_error_no_device_interface@PLT
+	testl	%eax, %eax
+	jne	.LBB130_16                      # nonzero error code is returned as is
+	jmp	.LBB130_11
+.LBB130_1:                              # %if.then.i
+	leaq	.L.str.17.94(%rip), %rsi
+	movq	%r14, %rdi
+	callq	halide_error_buffer_is_null@PLT
+	testl	%eax, %eax
+	jne	.LBB130_16
+	jmp	.LBB130_11
+.LBB130_5:                              # %if.end16.i
+	testq	%rax, %rax
+	je	.LBB130_8
+# %bb.6:                                # %if.end16.i
+	testq	%rcx, %rcx
+	jne	.LBB130_8
+# %bb.7:                                # %if.then20.i
+	movq	%r14, %rdi                      # offset 8 is non-null but offset 0 is zero
+	callq	halide_error_device_interface_no_device@PLT
+	testl	%eax, %eax
+	jne	.LBB130_16
+	jmp	.LBB130_11
+.LBB130_8:                              # %if.end28.i
+	movl	24(%r15), %ecx                  # low 32 bits of the word at offset 24, presumably the flags
+	notl	%ecx
+	testb	$3, %cl                         # after the inversion this is nonzero unless both low bits were set
+	jne	.LBB130_12                      # %rax still holds the pointer loaded from offset 8
+# %bb.9:                                # %if.then36.i
+	movq	%r14, %rdi                      # both low bits set
+	callq	halide_error_host_and_device_dirty@PLT
+	testl	%eax, %eax
+	jne	.LBB130_16
+.LBB130_11:                             # %_ZN12_GLOBAL__N_126debug_log_and_validate_bufEPvPK15halide_buffer_tPKc.exit.if.end_crit_edge
+	movq	8(%r15), %rax                   # reload: %rax was overwritten by the error reporter result
+.LBB130_12:                             # %if.end
+	testq	%rax, %rax
+	je	.LBB130_15                      # buffer has no stored pointer yet
+# %bb.13:                               # %if.end
+	cmpq	%rbx, %rax
+	je	.LBB130_15                      # stored pointer matches the requested interface
+# %bb.14:                               # %if.then5
+	leaq	.L.str.20.95(%rip), %rsi        # stored pointer differs from the requested interface: report and fail
+	movq	%r14, %rdi
+	callq	halide_error@PLT
+	movl	$-42, %eax
+	jmp	.LBB130_16
+.LBB130_15:                             # %_ZN12_GLOBAL__N_121call_device_interfaceIPFiPvP15halide_buffer_tEJRS1_RS3_EEEiPK25halide_device_interface_tT_DpOT0_.exit
+	movq	120(%rbx), %rax                 # table of function pointers reached through offset 120 of the interface
+	movq	16(%rax), %r12                  # fetch the entry at offset 16 before the bracketing call
+	callq	*(%rax)                         # table entry at offset 0 runs first; no argument registers are set up for it here
+	movq	%r14, %rdi
+	movq	%r15, %rsi
+	callq	*%r12                           # the offset-16 entry, called with (first argument, buffer)
+	movl	%eax, %r14d
+	movq	120(%rbx), %rax
+	callq	*8(%rax)                        # table entry at offset 8 runs afterwards
+	xorl	%eax, %eax
+	testl	%r14d, %r14d
+	sete	%al                             # 1 when the offset-16 entry returned 0
+	shll	$4, %eax
+	addl	$-16, %eax                      # maps a zero result to 0 and any nonzero result to -16
+.LBB130_16:                             # %cleanup13
+	popq	%rbx
+	popq	%r12
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end130:
+	.size	halide_device_malloc, .Lfunc_end130-halide_device_malloc
+                                        # -- End function
+	.section	.text.halide_copy_to_device,"ax",@progbits
+	.weak	halide_copy_to_device           # -- Begin function halide_copy_to_device
+	.p2align	4, 0x90
+	.type	halide_copy_to_device,@function
+halide_copy_to_device:                  # @halide_copy_to_device -- locking wrapper: forwards its three arguments to copy_to_device_already_locked while holding device_copy_mutex and returns that result
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r12
+	pushq	%rbx
+	movq	%rdx, %rbx                      # save the three incoming arguments across the lock call
+	movq	%rsi, %r14
+	movq	%rdi, %r15
+	movq	_ZN6Halide7Runtime8Internal17device_copy_mutexE@GOTPCREL(%rip), %r12
+	movq	%r12, %rdi
+	callq	halide_mutex_lock@PLT
+	movq	%r15, %rdi                      # restore the arguments in their original order
+	movq	%r14, %rsi
+	movq	%rbx, %rdx
+	callq	copy_to_device_already_locked@PLT
+	movl	%eax, %ebx                      # keep the result while the mutex is released
+	movq	%r12, %rdi
+	callq	halide_mutex_unlock@PLT
+	movl	%ebx, %eax
+	popq	%rbx
+	popq	%r12
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end131:
+	.size	halide_copy_to_device, .Lfunc_end131-halide_copy_to_device
+                                        # -- End function
+	.section	.text.halide_device_sync,"ax",@progbits
+	.weak	halide_device_sync              # -- Begin function halide_device_sync
+	.p2align	4, 0x90
+	.type	halide_device_sync,@function
+halide_device_sync:                     # @halide_device_sync -- two arguments in %rdi and %rsi (buffer); validates the buffer, then calls the interface table entry at offset 32; returns 0, -17 or an error reporter code
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r14
+	pushq	%rbx
+	movq	%rsi, %rbx                      # %rbx = buffer
+	movq	%rdi, %r14                      # %r14 = first argument, handed to every callee
+	testq	%rsi, %rsi
+	je	.LBB132_1                       # null buffer
+# %bb.2:                                # %if.end.i
+	movq	(%rbx), %rcx                    # word at buffer offset 0, presumably the device handle
+	movq	8(%rbx), %rax                   # pointer at buffer offset 8, presumably the device interface
+	testq	%rcx, %rcx
+	je	.LBB132_5
+# %bb.3:                                # %if.end.i
+	testq	%rax, %rax
+	jne	.LBB132_5
+# %bb.4:                                # %if.then8.i
+	movq	%r14, %rdi                      # offset 0 is nonzero but offset 8 is null
+	callq	halide_error_no_device_interface@PLT
+	testl	%eax, %eax
+	jne	.LBB132_14                      # nonzero error code is returned as is
+	jmp	.LBB132_11
+.LBB132_1:                              # %if.then.i
+	leaq	.L.str.16.96(%rip), %rsi
+	movq	%r14, %rdi
+	callq	halide_error_buffer_is_null@PLT
+	testl	%eax, %eax
+	jne	.LBB132_14
+	jmp	.LBB132_11
+.LBB132_5:                              # %if.end16.i
+	testq	%rax, %rax
+	je	.LBB132_8
+# %bb.6:                                # %if.end16.i
+	testq	%rcx, %rcx
+	jne	.LBB132_8
+# %bb.7:                                # %if.then20.i
+	movq	%r14, %rdi                      # offset 8 is non-null but offset 0 is zero
+	callq	halide_error_device_interface_no_device@PLT
+	testl	%eax, %eax
+	je	.LBB132_11
+.LBB132_14:                             # %cleanup8
+	popq	%rbx                            # early exit shared by the validation failures; %eax holds the error code
+	popq	%r14
+	popq	%rbp
+	retq
+.LBB132_8:                              # %if.end28.i
+	movl	24(%rbx), %ecx                  # low 32 bits of the word at offset 24, presumably the flags
+	notl	%ecx
+	testb	$3, %cl                         # after the inversion this is nonzero unless both low bits were set
+	jne	.LBB132_12                      # %rax still holds the pointer loaded from offset 8
+# %bb.9:                                # %if.then36.i
+	movq	%r14, %rdi                      # both low bits set
+	callq	halide_error_host_and_device_dirty@PLT
+	testl	%eax, %eax
+	jne	.LBB132_14
+.LBB132_11:                             # %_ZN12_GLOBAL__N_126debug_log_and_validate_bufEPvPK15halide_buffer_tPKc.exit.if.end_crit_edge
+	movq	8(%rbx), %rax                   # reload: %rax was overwritten by the error reporter result
+.LBB132_12:                             # %if.end
+	testq	%rax, %rax
+	je	.LBB132_15                      # null stored pointer: report it through the tail call below
+# %bb.13:                               # %if.end4
+	movq	120(%rax), %rax                 # table of function pointers reached through offset 120
+	movq	%r14, %rdi
+	movq	%rbx, %rsi
+	callq	*32(%rax)                       # table entry at offset 32, called with (first argument, buffer)
+	movl	%eax, %ecx
+	testl	%eax, %eax
+	movl	$-17, %eax                      # movl leaves the flags from testl intact
+	cmovel	%ecx, %eax                      # a zero result is returned as 0; any nonzero result becomes -17
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	retq
+.LBB132_15:                             # %if.then2
+	movq	%r14, %rdi
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	jmp	halide_error_no_device_interface@PLT # TAILCALL
+.Lfunc_end132:
+	.size	halide_device_sync, .Lfunc_end132-halide_device_sync
+                                        # -- End function
+	.section	.text.halide_device_free,"ax",@progbits
+	.weak	halide_device_free              # -- Begin function halide_device_free
+	.p2align	4, 0x90
+	.type	halide_device_free,@function
+halide_device_free:                     # @halide_device_free -- two arguments in %rdi and %rsi (buffer); validates the buffer, calls the interface table entry at offset 24 when an interface is stored, then clears the mask-2 bit at offset 24; returns 0, -18 or a validation error code
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r12
+	pushq	%rbx
+	movq	%rsi, %rbx                      # %rbx = buffer
+	movq	%rdi, %r14                      # %r14 = first argument, handed to every callee
+	testq	%rsi, %rsi
+	je	.LBB133_1                       # null buffer
+# %bb.2:                                # %if.end.i
+	movq	(%rbx), %rax                    # word at buffer offset 0, presumably the device handle
+	movq	8(%rbx), %r15                   # pointer at buffer offset 8, presumably the device interface
+	testq	%rax, %rax
+	je	.LBB133_5
+# %bb.3:                                # %if.end.i
+	testq	%r15, %r15
+	jne	.LBB133_5
+# %bb.4:                                # %if.then8.i
+	movq	%r14, %rdi                      # offset 0 is nonzero but offset 8 is null
+	callq	halide_error_no_device_interface@PLT
+	testl	%eax, %eax
+	jne	.LBB133_15                      # nonzero error code is returned as is
+	jmp	.LBB133_11
+.LBB133_1:                              # %if.then.i
+	leaq	.L.str.21.99(%rip), %rsi
+	movq	%r14, %rdi
+	callq	halide_error_buffer_is_null@PLT
+	testl	%eax, %eax
+	jne	.LBB133_15
+	jmp	.LBB133_11
+.LBB133_5:                              # %if.end16.i
+	testq	%r15, %r15
+	je	.LBB133_8
+# %bb.6:                                # %if.end16.i
+	testq	%rax, %rax
+	jne	.LBB133_8
+# %bb.7:                                # %if.then20.i
+	movq	%r14, %rdi                      # offset 8 is non-null but offset 0 is zero
+	callq	halide_error_device_interface_no_device@PLT
+	testl	%eax, %eax
+	jne	.LBB133_15
+	jmp	.LBB133_11
+.LBB133_8:                              # %if.end28.i
+	movl	24(%rbx), %eax                  # low 32 bits of the word at offset 24, presumably the flags
+	notl	%eax
+	testb	$3, %al                         # after the inversion this is nonzero unless both low bits were set
+	jne	.LBB133_12                      # %r15 still holds the pointer loaded from offset 8
+# %bb.9:                                # %if.then36.i
+	movq	%r14, %rdi                      # both low bits set
+	callq	halide_error_host_and_device_dirty@PLT
+	testl	%eax, %eax
+	jne	.LBB133_15
+.LBB133_11:                             # %_ZN12_GLOBAL__N_126debug_log_and_validate_bufEPvPK15halide_buffer_tPKc.exit.if.end_crit_edge
+	movq	8(%rbx), %r15                   # reload the pointer stored at offset 8
+.LBB133_12:                             # %if.end
+	testq	%r15, %r15
+	je	.LBB133_14                      # no stored pointer: skip the call and just clear the bit
+# %bb.13:                               # %_ZN12_GLOBAL__N_121call_device_interfaceIPFiPvP15halide_buffer_tEJRS1_RS3_EEEiPK25halide_device_interface_tT_DpOT0_.exit
+	movq	120(%r15), %rax                 # table of function pointers reached through offset 120
+	movq	24(%rax), %r12                  # fetch the entry at offset 24 before the bracketing call
+	callq	*(%rax)                         # table entry at offset 0 runs first; no argument registers are set up for it here
+	movq	%r14, %rdi
+	movq	%rbx, %rsi
+	callq	*%r12                           # the offset-24 entry, called with (first argument, buffer)
+	movl	%eax, %r14d
+	movq	120(%r15), %rax
+	callq	*8(%rax)                        # table entry at offset 8 runs afterwards
+	movl	$-18, %eax
+	testl	%r14d, %r14d
+	jne	.LBB133_15                      # nonzero result from the offset-24 entry: return -18, bit left untouched
+.LBB133_14:                             # %if.end8
+	andb	$-3, 24(%rbx)                   # clear the mask-2 bit of the word at offset 24
+	xorl	%eax, %eax
+.LBB133_15:                             # %cleanup10
+	popq	%rbx
+	popq	%r12
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end133:
+	.size	halide_device_free, .Lfunc_end133-halide_device_free
+                                        # -- End function
+	.section	.text.halide_device_free_as_destructor,"ax",@progbits
+	.weak	halide_device_free_as_destructor # -- Begin function halide_device_free_as_destructor
+	.p2align	4, 0x90
+	.type	halide_device_free_as_destructor,@function
+halide_device_free_as_destructor:       # @halide_device_free_as_destructor -- thin forwarder: hands its incoming argument registers unchanged to halide_device_free
+# %bb.0:                                # %entry
+	pushq	%rbp                            # frame is created and torn down again before the jump
+	movq	%rsp, %rbp
+	popq	%rbp
+	jmp	halide_device_free@PLT          # TAILCALL -- any value left in %eax by halide_device_free reaches the caller unchanged
+.Lfunc_end134:
+	.size	halide_device_free_as_destructor, .Lfunc_end134-halide_device_free_as_destructor
+                                        # -- End function
+	.section	.text.halide_device_and_host_malloc,"ax",@progbits
+	.weak	halide_device_and_host_malloc   # -- Begin function halide_device_and_host_malloc
+	.p2align	4, 0x90
+	.type	halide_device_and_host_malloc,@function
+halide_device_and_host_malloc:          # @halide_device_and_host_malloc -- three arguments in %rdi, %rsi (buffer), %rdx (interface); validates the buffer, then calls the interface table entry at offset 64; returns 0, -16, -42 or a validation error code
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r12
+	pushq	%rbx
+	movq	%rdx, %r14                      # %r14 = interface
+	movq	%rsi, %r15                      # %r15 = buffer
+	movq	%rdi, %rbx                      # %rbx = first argument, handed to every callee
+	testq	%rsi, %rsi
+	je	.LBB135_1                       # null buffer
+# %bb.2:                                # %if.end.i
+	movq	(%r15), %rcx                    # word at buffer offset 0, presumably the device handle
+	movq	8(%r15), %rax                   # pointer at buffer offset 8, presumably the device interface
+	testq	%rcx, %rcx
+	je	.LBB135_5
+# %bb.3:                                # %if.end.i
+	testq	%rax, %rax
+	jne	.LBB135_5
+# %bb.4:                                # %if.then8.i
+	movq	%rbx, %rdi                      # offset 0 is nonzero but offset 8 is null
+	callq	halide_error_no_device_interface@PLT
+	testl	%eax, %eax
+	jne	.LBB135_19                      # nonzero error code is returned as is
+	jmp	.LBB135_11
+.LBB135_1:                              # %if.then.i
+	leaq	.L.str.22.100(%rip), %rsi
+	movq	%rbx, %rdi
+	callq	halide_error_buffer_is_null@PLT
+	testl	%eax, %eax
+	jne	.LBB135_19
+	jmp	.LBB135_11
+.LBB135_5:                              # %if.end16.i
+	testq	%rax, %rax
+	je	.LBB135_8
+# %bb.6:                                # %if.end16.i
+	testq	%rcx, %rcx
+	jne	.LBB135_8
+# %bb.7:                                # %if.then20.i
+	movq	%rbx, %rdi                      # offset 8 is non-null but offset 0 is zero
+	callq	halide_error_device_interface_no_device@PLT
+	testl	%eax, %eax
+	jne	.LBB135_19
+	jmp	.LBB135_11
+.LBB135_8:                              # %if.end28.i
+	movl	24(%r15), %ecx                  # low 32 bits of the word at offset 24, presumably the flags
+	notl	%ecx
+	testb	$3, %cl                         # after the inversion this is nonzero unless both low bits were set
+	jne	.LBB135_12                      # %rax still holds the pointer loaded from offset 8
+# %bb.9:                                # %if.then36.i
+	movq	%rbx, %rdi                      # both low bits set
+	callq	halide_error_host_and_device_dirty@PLT
+	testl	%eax, %eax
+	jne	.LBB135_19
+.LBB135_11:                             # %_ZN12_GLOBAL__N_126debug_log_and_validate_bufEPvPK15halide_buffer_tPKc.exit.if.end_crit_edge
+	movq	8(%r15), %rax                   # reload: %rax was overwritten by the error reporter result
+.LBB135_12:                             # %if.end
+	testq	%rax, %rax
+	je	.LBB135_18                      # buffer has no stored pointer yet
+# %bb.13:                               # %if.end
+	cmpq	%r14, %rax
+	je	.LBB135_18                      # stored pointer matches the requested interface
+# %bb.14:                               # %if.then5
+	movl	$1024, %edi                     # imm = 0x400 -- mismatch: allocate a 1024-byte buffer for the error message
+	callq	malloc@PLT
+	movq	%rax, %r14                      # %r14 is reused for the message buffer from here on
+	testq	%rax, %rax
+	je	.LBB135_15                      # allocation failed
+# %bb.16:                               # %if.else.i
+	leaq	1023(%r14), %rsi                # second argument: address of the last byte of the allocation
+	movb	$0, 1023(%r14)                  # store a zero byte at the last position
+	leaq	.L.str.24.101(%rip), %rdx
+	movq	%r14, %rdi
+	callq	halide_string_to_string@PLT
+	subq	%r14, %rax                      # distance between the returned pointer and the buffer start
+	leaq	1(%rax), %rdx                   # that distance plus one is the size passed to the msan annotation
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	callq	halide_msan_annotate_memory_is_initialized@PLT
+	movq	%r14, %rsi                      # the composed message is what gets reported
+	jmp	.LBB135_17
+.LBB135_18:                             # %_ZN12_GLOBAL__N_121call_device_interfaceIPFiPvP15halide_buffer_tEJRS1_RS3_EEEiPK25halide_device_interface_tT_DpOT0_.exit
+	movq	120(%r14), %rax                 # table of function pointers reached through offset 120 of the interface
+	movq	64(%rax), %r12                  # fetch the entry at offset 64 before the bracketing call
+	callq	*(%rax)                         # table entry at offset 0 runs first; no argument registers are set up for it here
+	movq	%rbx, %rdi
+	movq	%r15, %rsi
+	callq	*%r12                           # the offset-64 entry, called with (first argument, buffer)
+	movl	%eax, %ebx
+	movq	120(%r14), %rax
+	callq	*8(%rax)                        # table entry at offset 8 runs afterwards
+	xorl	%eax, %eax
+	testl	%ebx, %ebx
+	sete	%al                             # 1 when the offset-64 entry returned 0
+	shll	$4, %eax
+	addl	$-16, %eax                      # maps a zero result to 0 and any nonzero result to -16
+	jmp	.LBB135_19
+.LBB135_15:                             # %if.then.i29
+	leaq	.L.str.24.101(%rip), %rdx
+	xorl	%edi, %edi                      # the formatter is still called, with null destination and end pointers
+	xorl	%esi, %esi
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.29.165(%rip), %rsi       # report this fixed string in place of the message that could not be built
+.LBB135_17:                             # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT
+	movq	%r14, %rdi                      # message buffer; null on the allocation-failure path
+	callq	free@PLT
+	movl	$-42, %eax
+.LBB135_19:                             # %cleanup17
+	popq	%rbx
+	popq	%r12
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end135:
+	.size	halide_device_and_host_malloc, .Lfunc_end135-halide_device_and_host_malloc
+                                        # -- End function
+	.section	.text.halide_device_and_host_free,"ax",@progbits
+	.weak	halide_device_and_host_free     # -- Begin function halide_device_and_host_free
+	.p2align	4, 0x90
+	.type	halide_device_and_host_free,@function
+halide_device_and_host_free:            # @halide_device_and_host_free -- two arguments in %rdi and %rsi (buffer); validates the buffer, then either calls the interface table entry at offset 72 or releases the pointer at offset 16 with halide_free; returns 0, -18 or a validation error code
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r12
+	pushq	%rbx
+	movq	%rsi, %rbx                      # %rbx = buffer
+	movq	%rdi, %r14                      # %r14 = first argument, handed to every callee
+	testq	%rsi, %rsi
+	je	.LBB136_1                       # null buffer
+# %bb.2:                                # %if.end.i
+	movq	(%rbx), %rax                    # word at buffer offset 0, presumably the device handle
+	movq	8(%rbx), %r15                   # pointer at buffer offset 8, presumably the device interface
+	testq	%rax, %rax
+	je	.LBB136_5
+# %bb.3:                                # %if.end.i
+	testq	%r15, %r15
+	jne	.LBB136_5
+# %bb.4:                                # %if.then8.i
+	movq	%r14, %rdi                      # offset 0 is nonzero but offset 8 is null
+	callq	halide_error_no_device_interface@PLT
+	testl	%eax, %eax
+	jne	.LBB136_17                      # nonzero error code is returned as is
+	jmp	.LBB136_11
+.LBB136_1:                              # %if.then.i
+	leaq	.L.str.26.102(%rip), %rsi
+	movq	%r14, %rdi
+	callq	halide_error_buffer_is_null@PLT
+	testl	%eax, %eax
+	jne	.LBB136_17
+	jmp	.LBB136_11
+.LBB136_5:                              # %if.end16.i
+	testq	%r15, %r15
+	je	.LBB136_8
+# %bb.6:                                # %if.end16.i
+	testq	%rax, %rax
+	jne	.LBB136_8
+# %bb.7:                                # %if.then20.i
+	movq	%r14, %rdi                      # offset 8 is non-null but offset 0 is zero
+	callq	halide_error_device_interface_no_device@PLT
+	testl	%eax, %eax
+	jne	.LBB136_17
+	jmp	.LBB136_11
+.LBB136_8:                              # %if.end28.i
+	movl	24(%rbx), %eax                  # low 32 bits of the word at offset 24, presumably the flags
+	notl	%eax
+	testb	$3, %al                         # after the inversion this is nonzero unless both low bits were set
+	jne	.LBB136_12                      # %r15 still holds the pointer loaded from offset 8
+# %bb.9:                                # %if.then36.i
+	movq	%r14, %rdi                      # both low bits set
+	callq	halide_error_host_and_device_dirty@PLT
+	testl	%eax, %eax
+	jne	.LBB136_17
+.LBB136_11:                             # %_ZN12_GLOBAL__N_126debug_log_and_validate_bufEPvPK15halide_buffer_tPKc.exit.if.end_crit_edge
+	movq	8(%rbx), %r15                   # reload the pointer stored at offset 8
+.LBB136_12:                             # %if.end
+	testq	%r15, %r15
+	je	.LBB136_14                      # no stored pointer: take the halide_free path instead
+# %bb.13:                               # %_ZN12_GLOBAL__N_121call_device_interfaceIPFiPvP15halide_buffer_tEJRS1_RS3_EEEiPK25halide_device_interface_tT_DpOT0_.exit
+	movq	120(%r15), %rax                 # table of function pointers reached through offset 120
+	movq	72(%rax), %r12                  # fetch the entry at offset 72 before the bracketing call
+	callq	*(%rax)                         # table entry at offset 0 runs first; no argument registers are set up for it here
+	movq	%r14, %rdi
+	movq	%rbx, %rsi
+	callq	*%r12                           # the offset-72 entry, called with (first argument, buffer)
+	movl	%eax, %r14d
+	movq	120(%r15), %rax
+	callq	*8(%rax)                        # table entry at offset 8 runs afterwards
+	movl	$-18, %eax
+	testl	%r14d, %r14d
+	jne	.LBB136_17                      # nonzero result from the offset-72 entry: return -18, bit left untouched
+	jmp	.LBB136_16
+.LBB136_14:                             # %if.else
+	movq	16(%rbx), %rsi                  # pointer at buffer offset 16, presumably the host allocation
+	testq	%rsi, %rsi
+	je	.LBB136_16                      # nothing to release
+# %bb.15:                               # %if.then9
+	movq	%r14, %rdi
+	callq	halide_free@PLT
+	movq	$0, 16(%rbx)                    # null the stored pointer after releasing it
+.LBB136_16:                             # %if.end13
+	andb	$-3, 24(%rbx)                   # clear the mask-2 bit of the word at offset 24
+	xorl	%eax, %eax
+.LBB136_17:                             # %cleanup15
+	popq	%rbx
+	popq	%r12
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end136:
+	.size	halide_device_and_host_free, .Lfunc_end136-halide_device_and_host_free
+                                        # -- End function
+	.section	.text.halide_default_device_and_host_malloc,"ax",@progbits	# Validates the buffer (arg2 in rsi), allocates storage with halide_malloc, then calls halide_device_malloc; returns 0 or an error code in eax.
+	.weak	halide_default_device_and_host_malloc # -- Begin function halide_default_device_and_host_malloc
+	.p2align	4, 0x90
+	.type	halide_default_device_and_host_malloc,@function
+halide_default_device_and_host_malloc:  # @halide_default_device_and_host_malloc
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r12
+	pushq	%rbx
+	movq	%rdx, %r15	# r15 = arg3, forwarded later to halide_device_malloc
+	movq	%rsi, %rbx	# rbx = arg2, the buffer pointer
+	movq	%rdi, %r14	# r14 = arg1, passed as first argument to every callee
+	testq	%rsi, %rsi
+	je	.LBB137_1	# null buffer: report it
+# %bb.2:                                # %if.end.i
+	movq	(%rbx), %rax	# rax = qword at offset 0 of the buffer
+	movq	8(%rbx), %rcx	# rcx = qword at offset 8 of the buffer
+	testq	%rax, %rax
+	je	.LBB137_5
+# %bb.3:                                # %if.end.i
+	testq	%rcx, %rcx
+	jne	.LBB137_5
+# %bb.4:                                # %if.then8.i
+	movq	%r14, %rdi	# offset 0 is nonzero while offset 8 is zero
+	callq	halide_error_no_device_interface@PLT
+	movl	%eax, %r12d
+	testl	%eax, %eax
+	jne	.LBB137_27	# nonzero result is returned as-is
+	jmp	.LBB137_11
+.LBB137_1:                              # %if.then.i
+	leaq	.L.str.27.103(%rip), %rsi
+	movq	%r14, %rdi
+	callq	halide_error_buffer_is_null@PLT
+	movl	%eax, %r12d
+	testl	%eax, %eax
+	jne	.LBB137_27
+	jmp	.LBB137_11
+.LBB137_5:                              # %if.end16.i
+	testq	%rcx, %rcx
+	je	.LBB137_8
+# %bb.6:                                # %if.end16.i
+	testq	%rax, %rax
+	jne	.LBB137_8
+# %bb.7:                                # %if.then20.i
+	movq	%r14, %rdi	# offset 8 is nonzero while offset 0 is zero
+	callq	halide_error_device_interface_no_device@PLT
+	movl	%eax, %r12d
+	testl	%eax, %eax
+	jne	.LBB137_27
+	jmp	.LBB137_11
+.LBB137_8:                              # %if.end28.i
+	movl	24(%rbx), %eax	# eax = flag word at offset 24
+	notl	%eax
+	testb	$3, %al	# after the NOT, ZF is set only when bits 0 and 1 were both set
+	jne	.LBB137_11
+# %bb.9:                                # %if.then36.i
+	movq	%r14, %rdi
+	callq	halide_error_host_and_device_dirty@PLT
+	movl	%eax, %r12d
+	testl	%eax, %eax
+	jne	.LBB137_27
+.LBB137_11:                             # %if.end
+	movl	36(%rbx), %ecx	# ecx = 32-bit count at offset 36 of the buffer
+	testl	%ecx, %ecx
+	jle	.LBB137_12	# count is zero or negative: element count is 1
+# %bb.13:                               # %for.body.lr.ph.i.i
+	movq	40(%rbx), %rdx	# rdx = pointer at offset 40, walked below as 16-byte entries
+	shlq	$4, %rcx	# rcx = count * 16, the byte length of that array
+	xorl	%esi, %esi	# rsi = byte offset into the array
+	xorl	%eax, %eax	# rax = sum over entries whose field at +8 is positive
+	jmp	.LBB137_14
+	.p2align	4, 0x90
+.LBB137_16:                             # %if.end.i.i
+                                        #   in Loop: Header=BB137_14 Depth=1
+	addq	$16, %rsi
+	cmpq	%rsi, %rcx
+	je	.LBB137_17
+.LBB137_14:                             # %for.body.i.i
+                                        # =>This Inner Loop Header: Depth=1
+	movl	8(%rdx,%rsi), %edi	# edi = 32-bit field at entry+8
+	testl	%edi, %edi
+	jle	.LBB137_16	# only positive values contribute in this loop
+# %bb.15:                               # %if.then.i.i
+                                        #   in Loop: Header=BB137_14 Depth=1
+	movslq	4(%rdx,%rsi), %r8	# r8 = sign-extended 32-bit field at entry+4
+	decq	%r8
+	imulq	%rdi, %r8	# (field at +4 minus 1) * field at +8
+	addq	%r8, %rax
+	jmp	.LBB137_16
+.LBB137_17:                             # %for.body.i12.i.preheader
+	xorl	%esi, %esi	# restart the walk over the same array
+	xorl	%edi, %edi	# rdi = sum over entries whose field at +8 is negative
+	jmp	.LBB137_18
+	.p2align	4, 0x90
+.LBB137_20:                             # %if.end.i23.i
+                                        #   in Loop: Header=BB137_18 Depth=1
+	addq	$16, %rsi
+	cmpq	%rsi, %rcx
+	je	.LBB137_21
+.LBB137_18:                             # %for.body.i12.i
+                                        # =>This Inner Loop Header: Depth=1
+	movslq	8(%rdx,%rsi), %r8
+	testq	%r8, %r8
+	jns	.LBB137_20	# only negative values contribute in this loop
+# %bb.19:                               # %if.then.i19.i
+                                        #   in Loop: Header=BB137_18 Depth=1
+	movslq	4(%rdx,%rsi), %r9
+	decq	%r9
+	imulq	%r8, %r9	# (field at +4 minus 1) * field at +8, a value that is zero or negative when the +4 field is at least 1
+	addq	%r9, %rdi
+	jmp	.LBB137_20
+.LBB137_21:                             # %_ZNK15halide_buffer_t12begin_offsetEv.exit.loopexit.i
+	subq	%rdi, %rax	# positive-side sum minus negative-side sum
+	incq	%rax	# plus one gives the element count
+	jmp	.LBB137_22
+.LBB137_12:
+	movl	$1, %eax
+.LBB137_22:                             # %_ZNK15halide_buffer_t13size_in_bytesEv.exit
+	movzbl	33(%rbx), %esi	# esi = byte at offset 33 of the buffer
+	addq	$7, %rsi
+	shrq	$3, %rsi	# rsi = (that byte + 7) / 8
+	imulq	%rax, %rsi	# rsi = size argument for halide_malloc
+	movq	%r14, %rdi
+	callq	halide_malloc@PLT
+	movq	%rax, 16(%rbx)	# store the returned pointer at offset 16 of the buffer
+	testq	%rax, %rax
+	je	.LBB137_23	# allocation failed
+# %bb.24:                               # %if.end5
+	movq	%r14, %rdi
+	movq	%rbx, %rsi
+	movq	%r15, %rdx
+	callq	halide_device_malloc@PLT
+	testl	%eax, %eax
+	je	.LBB137_25
+# %bb.26:                               # %if.then8
+	movl	%eax, %r12d	# keep the halide_device_malloc error as the return value
+	movq	16(%rbx), %rsi
+	movq	%r14, %rdi
+	callq	halide_free@PLT	# undo the allocation made above
+	movq	$0, 16(%rbx)	# and clear the stored pointer
+	jmp	.LBB137_27
+.LBB137_23:
+	movl	$-34, %r12d	# halide_malloc returned null: return -34
+	jmp	.LBB137_27
+.LBB137_25:
+	xorl	%r12d, %r12d	# success: return 0
+.LBB137_27:                             # %cleanup12
+	movl	%r12d, %eax
+	popq	%rbx
+	popq	%r12
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end137:
+	.size	halide_default_device_and_host_malloc, .Lfunc_end137-halide_default_device_and_host_malloc
+                                        # -- End function
+	.section	.text.halide_default_device_and_host_free,"ax",@progbits	# Validates the buffer (arg2 in rsi), calls halide_device_free, frees the pointer stored at offset 16, and clears the two low flag bits; returns the halide_device_free result.
+	.weak	halide_default_device_and_host_free # -- Begin function halide_default_device_and_host_free
+	.p2align	4, 0x90
+	.type	halide_default_device_and_host_free,@function
+halide_default_device_and_host_free:    # @halide_default_device_and_host_free
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%rbx
+	pushq	%rax	# reserves one 8-byte slot; released by addq $8, %rsp at cleanup
+	movq	%rsi, %rbx	# rbx = arg2, the buffer pointer
+	movq	%rdi, %r14	# r14 = arg1, passed as first argument to every callee
+	testq	%rsi, %rsi
+	je	.LBB138_1	# null buffer: report it
+# %bb.2:                                # %if.end.i
+	movq	(%rbx), %rax	# rax = qword at offset 0 of the buffer
+	movq	8(%rbx), %rcx	# rcx = qword at offset 8 of the buffer
+	testq	%rax, %rax
+	je	.LBB138_5
+# %bb.3:                                # %if.end.i
+	testq	%rcx, %rcx
+	jne	.LBB138_5
+# %bb.4:                                # %if.then8.i
+	movq	%r14, %rdi	# offset 0 is nonzero while offset 8 is zero
+	callq	halide_error_no_device_interface@PLT
+	movl	%eax, %r15d
+	testl	%eax, %eax
+	jne	.LBB138_14	# nonzero result is returned as-is
+	jmp	.LBB138_11
+.LBB138_1:                              # %if.then.i
+	leaq	.L.str.28.104(%rip), %rsi
+	movq	%r14, %rdi
+	callq	halide_error_buffer_is_null@PLT
+	movl	%eax, %r15d
+	testl	%eax, %eax
+	jne	.LBB138_14
+	jmp	.LBB138_11
+.LBB138_5:                              # %if.end16.i
+	testq	%rcx, %rcx
+	je	.LBB138_8
+# %bb.6:                                # %if.end16.i
+	testq	%rax, %rax
+	jne	.LBB138_8
+# %bb.7:                                # %if.then20.i
+	movq	%r14, %rdi	# offset 8 is nonzero while offset 0 is zero
+	callq	halide_error_device_interface_no_device@PLT
+	movl	%eax, %r15d
+	testl	%eax, %eax
+	jne	.LBB138_14
+	jmp	.LBB138_11
+.LBB138_8:                              # %if.end28.i
+	movl	24(%rbx), %eax	# eax = flag word at offset 24
+	notl	%eax
+	testb	$3, %al	# after the NOT, ZF is set only when bits 0 and 1 were both set
+	jne	.LBB138_11
+# %bb.9:                                # %if.then36.i
+	movq	%r14, %rdi
+	callq	halide_error_host_and_device_dirty@PLT
+	movl	%eax, %r15d
+	testl	%eax, %eax
+	jne	.LBB138_14
+.LBB138_11:                             # %_ZN12_GLOBAL__N_126debug_log_and_validate_bufEPvPK15halide_buffer_tPKc.exit.split
+	movq	%r14, %rdi
+	movq	%rbx, %rsi
+	callq	halide_device_free@PLT
+	movl	%eax, %r15d	# this result becomes the return value
+	movq	16(%rbx), %rsi	# rsi = pointer stored at offset 16 of the buffer
+	testq	%rsi, %rsi
+	je	.LBB138_13	# nothing stored: skip halide_free
+# %bb.12:                               # %if.then3
+	movq	%r14, %rdi
+	callq	halide_free@PLT
+	movq	$0, 16(%rbx)	# clear the stored pointer after freeing it
+.LBB138_13:                             # %if.end6
+	andb	$-4, 24(%rbx)	# clear bits 0 and 1 of the flag word at offset 24
+.LBB138_14:                             # %cleanup
+	movl	%r15d, %eax
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end138:
+	.size	halide_default_device_and_host_free, .Lfunc_end138-halide_default_device_and_host_free
+                                        # -- End function
+	.section	.text.halide_device_wrap_native,"ax",@progbits	# Validates the buffer (arg2), checks that its offset-8 pointer is null or equal to arg4, then calls through the function table at offset 120 of arg4; returns 0, -16, -42 or a validation error.
+	.weak	halide_device_wrap_native       # -- Begin function halide_device_wrap_native
+	.p2align	4, 0x90
+	.type	halide_device_wrap_native,@function
+halide_device_wrap_native:              # @halide_device_wrap_native
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	pushq	%rax	# reserves one 8-byte slot; released by addq $8, %rsp at cleanup
+	movq	%rcx, %r14	# r14 = arg4
+	movq	%rdx, %r15	# r15 = arg3
+	movq	%rsi, %r12	# r12 = arg2, the buffer pointer
+	movq	%rdi, %rbx	# rbx = arg1, passed as first argument to the error callees
+	testq	%rsi, %rsi
+	je	.LBB139_1	# null buffer: report it
+# %bb.2:                                # %if.end.i
+	movq	(%r12), %rcx	# rcx = qword at offset 0 of the buffer
+	movq	8(%r12), %rax	# rax = qword at offset 8 of the buffer
+	testq	%rcx, %rcx
+	je	.LBB139_5
+# %bb.3:                                # %if.end.i
+	testq	%rax, %rax
+	jne	.LBB139_5
+# %bb.4:                                # %if.then8.i
+	movq	%rbx, %rdi	# offset 0 is nonzero while offset 8 is zero
+	callq	halide_error_no_device_interface@PLT
+	testl	%eax, %eax
+	jne	.LBB139_19	# nonzero result is returned as-is in eax
+	jmp	.LBB139_11
+.LBB139_1:                              # %if.then.i
+	leaq	.L.str.29.105(%rip), %rsi
+	movq	%rbx, %rdi
+	callq	halide_error_buffer_is_null@PLT
+	testl	%eax, %eax
+	jne	.LBB139_19
+	jmp	.LBB139_11
+.LBB139_5:                              # %if.end16.i
+	testq	%rax, %rax
+	je	.LBB139_8
+# %bb.6:                                # %if.end16.i
+	testq	%rcx, %rcx
+	jne	.LBB139_8
+# %bb.7:                                # %if.then20.i
+	movq	%rbx, %rdi	# offset 8 is nonzero while offset 0 is zero
+	callq	halide_error_device_interface_no_device@PLT
+	testl	%eax, %eax
+	jne	.LBB139_19
+	jmp	.LBB139_11
+.LBB139_8:                              # %if.end28.i
+	movl	24(%r12), %ecx	# ecx = flag word at offset 24
+	notl	%ecx
+	testb	$3, %cl	# after the NOT, ZF is set only when bits 0 and 1 were both set
+	jne	.LBB139_12	# rax still holds the offset-8 value on this path
+# %bb.9:                                # %if.then36.i
+	movq	%rbx, %rdi
+	callq	halide_error_host_and_device_dirty@PLT
+	testl	%eax, %eax
+	jne	.LBB139_19
+.LBB139_11:                             # %_ZN12_GLOBAL__N_126debug_log_and_validate_bufEPvPK15halide_buffer_tPKc.exit.if.end_crit_edge
+	movq	8(%r12), %rax	# reload the offset-8 value clobbered by the error call
+.LBB139_12:                             # %if.end
+	testq	%rax, %rax
+	je	.LBB139_18	# no pointer stored at offset 8 yet: proceed
+# %bb.13:                               # %if.end
+	cmpq	%r14, %rax
+	je	.LBB139_18	# stored pointer already equals arg4: proceed
+# %bb.14:                               # %if.then3
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT
+	movq	%rax, %r14	# r14 now holds the message buffer (arg4 is no longer needed)
+	testq	%rax, %rax
+	je	.LBB139_15	# malloc failed: use the fallback string instead
+# %bb.16:                               # %if.else.i
+	leaq	1023(%r14), %rsi	# rsi = address of the last byte of the 1024-byte buffer
+	movb	$0, 1023(%r14)	# zero that last byte
+	leaq	.L.str.30.106(%rip), %rdx
+	movq	%r14, %rdi
+	callq	halide_string_to_string@PLT
+	subq	%r14, %rax
+	leaq	1(%rax), %rdx	# rdx = (returned pointer - buffer start) + 1
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	callq	halide_msan_annotate_memory_is_initialized@PLT
+	movq	%r14, %rsi	# the formatted buffer is the message for halide_error
+	jmp	.LBB139_17
+.LBB139_18:                             # %_ZN12_GLOBAL__N_121call_device_interfaceIPFiPvP15halide_buffer_tyEJRS1_RS3_RyEEEiPK25halide_device_interface_tT_DpOT0_.exit
+	movq	%r14, 8(%r12)	# store arg4 at offset 8 of the buffer
+	movq	120(%r14), %rax	# rax = function table pointer at offset 120 of arg4
+	movq	112(%rax), %r13	# r13 = table entry at offset 112, called below
+	callq	*(%rax)	# call the table entry at offset 0
+	movq	%rbx, %rdi
+	movq	%r12, %rsi
+	movq	%r15, %rdx
+	callq	*%r13	# entry at offset 112, called with (arg1, buffer, arg3)
+	movl	%eax, %ebx	# keep its result across the next call
+	movq	120(%r14), %rax
+	callq	*8(%rax)	# call the table entry at offset 8
+	xorl	%eax, %eax
+	testl	%ebx, %ebx
+	sete	%al
+	shll	$4, %eax
+	addl	$-16, %eax	# eax = 0 when the result was 0, otherwise -16
+	jmp	.LBB139_19
+.LBB139_15:                             # %if.then.i25
+	leaq	.L.str.30.106(%rip), %rdx
+	xorl	%edi, %edi
+	xorl	%esi, %esi
+	callq	halide_string_to_string@PLT	# called with null destination and null end pointer
+	leaq	.L.str.29.165(%rip), %rsi	# fallback message used when no buffer could be allocated
+.LBB139_17:                             # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT
+	movq	%r14, %rdi
+	callq	free@PLT	# r14 is the malloc result, which is null on the fallback path
+	movl	$-42, %eax	# mismatch between the stored pointer and arg4: return -42
+.LBB139_19:                             # %cleanup13
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end139:
+	.size	halide_device_wrap_native, .Lfunc_end139-halide_device_wrap_native
+                                        # -- End function
+	.section	.text.halide_device_detach_native,"ax",@progbits	# Validates the buffer (arg2) and, when its offset-8 pointer is set, calls the entry at offset 120 of that pointer's function table; returns 0, -33, -22 or a validation error.
+	.weak	halide_device_detach_native     # -- Begin function halide_device_detach_native
+	.p2align	4, 0x90
+	.type	halide_device_detach_native,@function
+halide_device_detach_native:            # @halide_device_detach_native
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r12
+	pushq	%rbx
+	movq	%rsi, %r14	# r14 = arg2, the buffer pointer
+	movq	%rdi, %rbx	# rbx = arg1, passed as first argument to the callees
+	testq	%rsi, %rsi
+	je	.LBB140_1	# null buffer: report it
+# %bb.2:                                # %if.end.i
+	movq	(%r14), %rax	# rax = qword at offset 0 of the buffer
+	movq	8(%r14), %r12	# r12 = qword at offset 8 of the buffer
+	testq	%rax, %rax
+	je	.LBB140_5
+# %bb.3:                                # %if.end.i
+	testq	%r12, %r12
+	jne	.LBB140_5
+# %bb.4:                                # %if.then8.i
+	movq	%rbx, %rdi	# offset 0 is nonzero while offset 8 is zero
+	callq	halide_error_no_device_interface@PLT
+	testl	%eax, %eax
+	jne	.LBB140_16	# nonzero result is returned as-is in eax
+	jmp	.LBB140_11
+.LBB140_1:                              # %if.then.i
+	leaq	.L.str.31.107(%rip), %rsi
+	movq	%rbx, %rdi
+	callq	halide_error_buffer_is_null@PLT
+	testl	%eax, %eax
+	jne	.LBB140_16
+	jmp	.LBB140_11
+.LBB140_5:                              # %if.end16.i
+	testq	%r12, %r12
+	je	.LBB140_8
+# %bb.6:                                # %if.end16.i
+	testq	%rax, %rax
+	jne	.LBB140_8
+# %bb.7:                                # %if.then20.i
+	movq	%rbx, %rdi	# offset 8 is nonzero while offset 0 is zero
+	callq	halide_error_device_interface_no_device@PLT
+	testl	%eax, %eax
+	jne	.LBB140_16
+	jmp	.LBB140_11
+.LBB140_8:                              # %if.end28.i
+	movl	24(%r14), %eax	# eax = flag word at offset 24
+	notl	%eax
+	testb	$3, %al	# after the NOT, ZF is set only when bits 0 and 1 were both set
+	jne	.LBB140_12	# r12 still holds the offset-8 value on this path
+# %bb.9:                                # %if.then36.i
+	movq	%rbx, %rdi
+	callq	halide_error_host_and_device_dirty@PLT
+	testl	%eax, %eax
+	jne	.LBB140_16
+.LBB140_11:                             # %_ZN12_GLOBAL__N_126debug_log_and_validate_bufEPvPK15halide_buffer_tPKc.exit.if.end_crit_edge
+	movq	8(%r14), %r12	# reload the offset-8 value after the error call
+.LBB140_12:                             # %if.end
+	xorl	%eax, %eax
+	testq	%r12, %r12
+	je	.LBB140_16	# nothing stored at offset 8: return 0
+# %bb.13:                               # %_ZN12_GLOBAL__N_121call_device_interfaceIPFiPvP15halide_buffer_tEJRS1_RS3_EEEiPK25halide_device_interface_tT_DpOT0_.exit
+	movq	120(%r12), %rax	# rax = function table pointer at offset 120
+	movq	120(%rax), %r15	# r15 = table entry at offset 120, called below
+	callq	*(%rax)	# call the table entry at offset 0
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	callq	*%r15	# entry at offset 120, called with (arg1, buffer)
+	movl	%eax, %r15d	# keep its result across the next call
+	movq	120(%r12), %rax
+	callq	*8(%rax)	# call the table entry at offset 8
+	movl	$-33, %eax
+	testl	%r15d, %r15d
+	jne	.LBB140_16	# nonzero result: return -33
+# %bb.14:                               # %cleanup.cont
+	cmpq	$0, (%r14)
+	movl	$0, %eax
+	je	.LBB140_16	# offset 0 of the buffer is now zero: return 0
+# %bb.15:                               # %if.then9
+	leaq	.L.str.32.108(%rip), %rsi
+	movq	%rbx, %rdi
+	callq	halide_error@PLT
+	movl	$-22, %eax	# offset 0 is still nonzero after the call: return -22
+.LBB140_16:                             # %cleanup17
+	popq	%rbx
+	popq	%r12
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end140:
+	.size	halide_device_detach_native, .Lfunc_end140-halide_device_detach_native
+                                        # -- End function
+	.section	.text.halide_default_device_wrap_native,"ax",@progbits	# Stores arg3 at offset 0 of the buffer (arg2) when that slot is zero; returns 0 on success, or -32 when the slot is already nonzero.
+	.weak	halide_default_device_wrap_native # -- Begin function halide_default_device_wrap_native
+	.p2align	4, 0x90
+	.type	halide_default_device_wrap_native,@function
+halide_default_device_wrap_native:      # @halide_default_device_wrap_native
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r14
+	pushq	%rbx
+	movl	$-32, %eax	# preload the return value for the already-set case
+	cmpq	$0, (%rsi)	# qword at offset 0 of the buffer
+	je	.LBB141_1
+# %bb.2:                                # %return
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	retq
+.LBB141_1:                              # %if.end
+	movq	%rdx, %rbx	# rbx = arg3, the value to store
+	movq	%rsi, %r14	# r14 = arg2, the buffer pointer
+	movq	8(%rsi), %rax	# rax = pointer at offset 8 of the buffer
+	movq	120(%rax), %rax	# rax = function table pointer at offset 120 of that object
+	callq	*(%rax)	# call the table entry at offset 0
+	movq	%rbx, (%r14)	# store arg3 at offset 0 of the buffer
+	xorl	%eax, %eax	# return 0
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	retq
+.Lfunc_end141:
+	.size	halide_default_device_wrap_native, .Lfunc_end141-halide_default_device_wrap_native
+                                        # -- End function
+	.section	.text.halide_default_device_detach_native,"ax",@progbits	# Validates the buffer (arg2); when its offset-0 slot is nonzero, calls the table entry at offset 8 and zeroes the 16 bytes at offsets 0-15; returns 0 or a validation error.
+	.weak	halide_default_device_detach_native # -- Begin function halide_default_device_detach_native
+	.p2align	4, 0x90
+	.type	halide_default_device_detach_native,@function
+halide_default_device_detach_native:    # @halide_default_device_detach_native
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r14
+	pushq	%rbx
+	movq	%rsi, %rbx	# rbx = arg2, the buffer pointer; rdi (arg1) is left untouched for the error calls below
+	testq	%rsi, %rsi
+	je	.LBB142_1	# null buffer: report it
+# %bb.2:                                # %if.end.i
+	movq	(%rbx), %rax	# rax = qword at offset 0 of the buffer
+	movq	8(%rbx), %rcx	# rcx = qword at offset 8 of the buffer
+	testq	%rax, %rax
+	je	.LBB142_5
+# %bb.3:                                # %if.end.i
+	testq	%rcx, %rcx
+	jne	.LBB142_5
+# %bb.4:                                # %if.then8.i
+	callq	halide_error_no_device_interface@PLT	# offset 0 is nonzero while offset 8 is zero
+	movl	%eax, %r14d
+	testl	%eax, %eax
+	jne	.LBB142_14	# nonzero result is returned as-is
+	jmp	.LBB142_11
+.LBB142_1:                              # %if.then.i
+	leaq	.L.str.34.109(%rip), %rsi
+	callq	halide_error_buffer_is_null@PLT
+	movl	%eax, %r14d
+	testl	%eax, %eax
+	jne	.LBB142_14
+	jmp	.LBB142_11
+.LBB142_5:                              # %if.end16.i
+	testq	%rcx, %rcx
+	je	.LBB142_8
+# %bb.6:                                # %if.end16.i
+	testq	%rax, %rax
+	jne	.LBB142_8
+# %bb.7:                                # %if.then20.i
+	callq	halide_error_device_interface_no_device@PLT	# offset 8 is nonzero while offset 0 is zero
+	movl	%eax, %r14d
+	testl	%eax, %eax
+	jne	.LBB142_14
+	jmp	.LBB142_11
+.LBB142_8:                              # %if.end28.i
+	movl	24(%rbx), %ecx	# ecx = flag word at offset 24
+	notl	%ecx
+	testb	$3, %cl	# after the NOT, ZF is set only when bits 0 and 1 were both set
+	jne	.LBB142_12	# rax still holds the offset-0 value on this path
+# %bb.9:                                # %if.then36.i
+	callq	halide_error_host_and_device_dirty@PLT
+	movl	%eax, %r14d
+	testl	%eax, %eax
+	jne	.LBB142_14
+.LBB142_11:                             # %_ZN12_GLOBAL__N_126debug_log_and_validate_bufEPvPK15halide_buffer_tPKc.exit.if.end_crit_edge
+	movq	(%rbx), %rax	# reload the offset-0 value after the error call
+.LBB142_12:                             # %if.end
+	xorl	%r14d, %r14d	# return value is 0 from here on
+	testq	%rax, %rax
+	je	.LBB142_14	# offset 0 is zero: nothing to detach
+# %bb.13:                               # %if.end2
+	movq	8(%rbx), %rax	# rax = pointer at offset 8 of the buffer
+	movq	120(%rax), %rax	# rax = function table pointer at offset 120 of that object
+	callq	*8(%rax)	# call the table entry at offset 8
+	vxorps	%xmm0, %xmm0, %xmm0
+	vmovups	%xmm0, (%rbx)	# zero the two qwords at offsets 0 and 8 in one 16-byte store
+.LBB142_14:                             # %cleanup
+	movl	%r14d, %eax
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	retq
+.Lfunc_end142:
+	.size	halide_default_device_detach_native, .Lfunc_end142-halide_default_device_detach_native
+                                        # -- End function
+	.section	.text.halide_device_and_host_free_as_destructor,"ax",@progbits	# Thin forwarder: tail-calls halide_device_and_host_free with the incoming argument registers unchanged.
+	.weak	halide_device_and_host_free_as_destructor # -- Begin function halide_device_and_host_free_as_destructor
+	.p2align	4, 0x90
+	.type	halide_device_and_host_free_as_destructor,@function
+halide_device_and_host_free_as_destructor: # @halide_device_and_host_free_as_destructor
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	popq	%rbp	# frame is torn down before the jump, so the callee returns directly to our caller
+	jmp	halide_device_and_host_free@PLT # TAILCALL
+.Lfunc_end143:
+	.size	halide_device_and_host_free_as_destructor, .Lfunc_end143-halide_device_and_host_free_as_destructor
+                                        # -- End function
+	.section	.text.halide_device_host_nop_free,"ax",@progbits	# No-op: sets up and tears down a frame, then returns without touching its arguments or eax.
+	.weak	halide_device_host_nop_free     # -- Begin function halide_device_host_nop_free
+	.p2align	4, 0x90
+	.type	halide_device_host_nop_free,@function
+halide_device_host_nop_free:            # @halide_device_host_nop_free
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	popq	%rbp
+	retq
+.Lfunc_end144:
+	.size	halide_device_host_nop_free, .Lfunc_end144-halide_device_host_nop_free
+                                        # -- End function
+	.section	.text.halide_default_buffer_copy,"ax",@progbits	# Stub: ignores its arguments and always returns the constant -39.
+	.weak	halide_default_buffer_copy      # -- Begin function halide_default_buffer_copy
+	.p2align	4, 0x90
+	.type	halide_default_buffer_copy,@function
+halide_default_buffer_copy:             # @halide_default_buffer_copy
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movl	$-39, %eax	# fixed return value
+	popq	%rbp
+	retq
+.Lfunc_end145:
+	.size	halide_default_buffer_copy, .Lfunc_end145-halide_default_buffer_copy
+                                        # -- End function
+	.section	.text.halide_buffer_copy_already_locked,"ax",@progbits	# Copies between two buffers (arg2 = source, arg4 = destination) with arg3 as an optional interface pointer; picks a copy strategy from flags derived from the source and returns 0 or an error code in eax.
+	.weak	halide_buffer_copy_already_locked # -- Begin function halide_buffer_copy_already_locked
+	.p2align	4, 0x90
+	.type	halide_buffer_copy_already_locked,@function
+halide_buffer_copy_already_locked:      # @halide_buffer_copy_already_locked
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	subq	$440, %rsp                      # imm = 0x1B8
+	movq	%rdx, %r12	# r12 = arg3 for the whole function
+	testq	%rdx, %rdx
+	je	.LBB146_5	# arg3 is null: skip the checks on the destination
+# %bb.1:                                # %land.lhs.true
+	movq	8(%rcx), %rax	# rax = qword at offset 8 of the destination
+	testq	%rax, %rax
+	je	.LBB146_4
+# %bb.2:                                # %land.lhs.true
+	cmpq	%r12, %rax
+	je	.LBB146_4	# stored pointer is null or equals arg3: acceptable
+# %bb.3:                                # %if.then
+	leaq	.L.str.40(%rip), %rsi
+	callq	halide_error@PLT
+	movl	$-42, %r13d	# destination already refers to a different pointer: return -42
+	jmp	.LBB146_46
+.LBB146_4:                              # %land.lhs.true5
+	cmpq	$0, (%rcx)	# qword at offset 0 of the destination
+	je	.LBB146_13	# zero: call halide_device_malloc on the destination first
+.LBB146_5:                              # %if.end13
+	cmpq	$0, (%rsi)	# qword at offset 0 of the source
+	movq	16(%rsi), %rax	# rax = qword at offset 16 of the source (mov leaves the flags from cmpq intact)
+	je	.LBB146_9
+# %bb.6:                                # %land.rhs
+	testq	%rax, %rax
+	je	.LBB146_12
+# %bb.7:                                # %land.end.thread284
+	movq	24(%rsi), %rax	# rax = flag word at offset 24 of the source
+	movl	%eax, %edx
+	andb	$1, %dl	# dl = bit 0 of the flag word
+	testb	$2, %al	# test bit 1 of the flag word
+	jne	.LBB146_11
+.LBB146_8:
+	xorl	%r8d, %r8d	# r8b = 0
+	xorl	%r9d, %r9d	# r9b = 0
+	jmp	.LBB146_16
+.LBB146_9:                              # %land.end
+	testq	%rax, %rax
+	je	.LBB146_14	# source offsets 0 and 16 are both zero
+# %bb.10:                               # %land.end.land.rhs26_crit_edge
+	movq	24(%rsi), %rax
+	movb	$1, %dl	# dl = 1 because source offset 0 is zero
+	testb	$2, %al
+	je	.LBB146_8
+.LBB146_11:                             # %lor.rhs28
+	cmpq	$0, 8(%rsi)
+	setne	%r9b	# r9b = 1 when source offset 8 is nonzero
+	xorl	%r8d, %r8d
+	jmp	.LBB146_16
+.LBB146_12:
+	movb	$1, %r8b	# source offset 0 is nonzero but offset 16 is zero
+	xorl	%edx, %edx
+	jmp	.LBB146_15
+.LBB146_13:                             # %if.then7
+	movq	%rdi, %r14	# preserve arg1, source and destination across the call
+	movq	%rsi, %r15
+	movq	%rcx, %rsi
+	movq	%r12, %rdx
+	movq	%rcx, %rbx
+	callq	halide_device_malloc@PLT	# called with (arg1, destination, arg3)
+	movq	%r15, %rsi
+	movq	%r14, %rdi
+	movq	%rbx, %rcx
+	movl	%eax, %r13d
+	testl	%eax, %eax
+	jne	.LBB146_46	# failure is returned as-is
+	jmp	.LBB146_5
+.LBB146_14:
+	movb	$1, %dl
+	movb	$1, %r8b
+.LBB146_15:                             # %land.end32
+	movb	$1, %r9b
+.LBB146_16:                             # %land.end32
+	testq	%r12, %r12
+	setne	%r10b	# r10b = 1 when arg3 is non-null
+	movq	16(%rcx), %r15	# r15 = qword at offset 16 of the destination
+	movl	$-34, %r13d
+	movq	%r12, %rax
+	orq	%r15, %rax
+	je	.LBB146_46	# arg3 and destination offset 16 are both null: return -34
+# %bb.17:                               # %if.end41
+	testq	%r12, %r12
+	sete	%al
+	orb	%dl, %al
+	jne	.LBB146_19	# arg3 is null or dl is set: skip the first attempt
+# %bb.18:                               # %if.end50
+	movq	120(%r12), %rax	# rax = function table pointer at offset 120 of arg3
+	movq	%rdi, %rbx
+	movq	%rsi, %r14
+	movl	%edx, -56(%rbp)                 # 4-byte Spill
+	movq	%r12, %rdx
+	movq	%rcx, %r13
+	movl	%r8d, -52(%rbp)                 # 4-byte Spill
+	movl	%r9d, -48(%rbp)                 # 4-byte Spill
+	movb	%r10b, -41(%rbp)                # 1-byte Spill
+	callq	*80(%rax)	# table entry at offset 80, called with (arg1, source, arg3, destination)
+	movzbl	-41(%rbp), %r10d                # 1-byte Folded Reload
+	movl	-48(%rbp), %r9d                 # 4-byte Reload
+	movl	-52(%rbp), %r8d                 # 4-byte Reload
+	movl	-56(%rbp), %edx                 # 4-byte Reload
+	movq	%r14, %rsi
+	movq	%rbx, %rdi
+	movq	%r13, %rcx
+	movl	%eax, %r13d
+	cmpl	$-42, %eax
+	jne	.LBB146_36	# any result other than -42 is final; -42 falls through to the fallbacks
+.LBB146_19:                             # %if.then52
+	testq	%r15, %r15
+	sete	%al
+	movl	$-42, %r13d
+	testb	%al, %r8b
+	jne	.LBB146_46	# r8b is set and destination offset 16 is null: return -42
+# %bb.20:                               # %if.end59
+	orb	%r10b, %r9b
+	cmpb	$1, %r9b
+	jne	.LBB146_28	# neither r9b nor r10b is set: plain memory copy
+# %bb.21:                               # %if.else
+	orb	%dl, %r10b
+	je	.LBB146_30	# arg3 is null and dl is clear
+# %bb.22:                               # %if.else82
+	testq	%r15, %r15
+	sete	%al
+	orb	%al, %dl
+	je	.LBB146_33	# dl is clear and destination offset 16 is non-null
+# %bb.23:                               # %if.else99
+	testq	%r12, %r12
+	je	.LBB146_44	# arg3 is null: report failure (r13d still holds -42)
+# %bb.24:                               # %if.then101
+	movq	%rcx, %rbx
+	movq	%rdi, %r14
+	movq	%rsi, %r15
+	callq	_ZN6Halide7Runtime8Internal27copy_to_host_already_lockedEPvP15halide_buffer_t@PLT
+	movq	%r14, %rdi
+	movl	%eax, %r13d
+	testl	%eax, %eax
+	jne	.LBB146_44
+# %bb.25:                               # %if.end125.thread301
+	movq	120(%r12), %rax
+	movq	%r15, %rsi
+	movq	%r12, %rdx
+	movq	%rbx, %rcx
+	callq	*80(%rax)	# retry the offset-80 entry of arg3 after the copy-to-host step
+	movl	%eax, %r13d
+	cmpq	%r15, %rbx
+	je	.LBB146_42	# source and destination are the same buffer: leave its flags alone
+# %bb.26:                               # %if.end125.thread301
+	testl	%r13d, %r13d
+	movq	%r14, %rdi
+	jne	.LBB146_43
+# %bb.27:                               # %if.then129.thread307
+	movq	24(%rbx), %rax
+	andq	$-4, %rax
+	orq	$2, %rax	# destination flags: clear bits 0-1, then set bit 1
+	movq	%rax, 24(%rbx)
+	jmp	.LBB146_45
+.LBB146_28:                             # %if.end125.thread
+	leaq	-472(%rbp), %r12	# r12 = address of a stack local that receives the make_buffer_copy result
+	movq	%rdi, %r14
+	movq	%r12, %rdi
+	movq	%rsi, %r15
+	movl	$1, %edx
+	movq	%rcx, %rbx
+	movl	$1, %r8d
+	callq	_ZN6Halide7Runtime8Internal16make_buffer_copyEPK15halide_buffer_tbS4_b@PLT
+	movq	%r12, %rdi
+	movq	%r14, %rsi
+	callq	_ZN6Halide7Runtime8Internal11copy_memoryERKNS1_11device_copyEPv@PLT
+	xorl	%r13d, %r13d	# this path always returns 0
+	cmpq	%r15, %rbx
+	je	.LBB146_46
+# %bb.29:                               # %if.else134.thread
+	movq	24(%rbx), %rax
+	andq	$-4, %rax
+	orq	$1, %rax	# destination flags: clear bits 0-1, then set bit 0
+	movq	%rax, 24(%rbx)
+	jmp	.LBB146_46
+.LBB146_30:                             # %if.then67
+	movq	8(%rsi), %rax	# rax = pointer at offset 8 of the source
+	movq	120(%rax), %rax
+	movq	%rdi, %r14
+	movq	%rsi, %r15
+	xorl	%edx, %edx
+	movq	%rcx, %rbx
+	callq	*80(%rax)	# source's offset-80 entry, called with (arg1, source, null, destination)
+	movq	%r15, %rsi
+	movq	%r14, %rdi
+	movq	%rbx, %rcx
+	movl	%eax, %r13d
+	cmpl	$-42, %eax
+	jne	.LBB146_36
+# %bb.31:                               # %if.then75
+	movq	%r14, %rdi
+	movq	%rsi, %r15
+	callq	_ZN6Halide7Runtime8Internal27copy_to_host_already_lockedEPvP15halide_buffer_t@PLT
+	movq	%r14, %rdi
+	movl	%eax, %r13d
+	testl	%eax, %eax
+	jne	.LBB146_44
+# %bb.32:                               # %if.then78
+	movq	%r14, %rdi
+	movq	%r15, %rsi
+	xorl	%edx, %edx
+	movq	%rbx, %rcx
+	callq	halide_buffer_copy_already_locked@PLT	# recursive call with arg3 = null
+	jmp	.LBB146_35
+.LBB146_33:                             # %if.then86
+	movq	8(%rsi), %rax
+	movq	120(%rax), %rax
+	movq	%rdi, %r14
+	movq	%rsi, %r15
+	xorl	%edx, %edx
+	movq	%rcx, %rbx
+	callq	*80(%rax)	# source's offset-80 entry, called with (arg1, source, null, destination)
+	movq	%r14, %rdi
+	movl	%eax, %r13d
+	testl	%eax, %eax
+	jne	.LBB146_44
+# %bb.34:                               # %if.then96
+	orb	$1, 24(%rbx)	# set bit 0 of the destination flags before copying to the device
+	movq	%r14, %rdi
+	movq	%rbx, %rsi
+	movq	%r12, %rdx
+	callq	copy_to_device_already_locked@PLT
+.LBB146_35:                             # %if.end125
+	movq	%r14, %rdi
+	movq	%rbx, %rcx
+	movl	%eax, %r13d
+	movq	%r15, %rsi
+.LBB146_36:                             # %if.end125
+	cmpq	%rsi, %rcx
+	je	.LBB146_43	# source and destination are the same buffer: leave its flags alone
+# %bb.37:                               # %if.end125
+	testl	%r13d, %r13d
+	jne	.LBB146_43
+# %bb.38:                               # %if.then129
+	movq	24(%rcx), %rax
+	andq	$-4, %rax	# clear bits 0-1 of the destination flags
+	testq	%r12, %r12
+	je	.LBB146_40
+# %bb.39:                               # %if.then131
+	orq	$2, %rax	# arg3 non-null: set bit 1
+	jmp	.LBB146_41
+.LBB146_40:                             # %if.else134
+	orq	$1, %rax	# arg3 null: set bit 0
+.LBB146_41:                             # %return
+	movq	%rax, 24(%rcx)
+	jmp	.LBB146_45
+.LBB146_42:
+	movq	%r14, %rdi
+.LBB146_43:                             # %if.end138
+	testl	%r13d, %r13d
+	je	.LBB146_45
+.LBB146_44:                             # %if.then140
+	leaq	.L.str.53(%rip), %rsi
+	callq	halide_error@PLT	# rdi still holds arg1 on every path that reaches this label
+	jmp	.LBB146_46
+.LBB146_45:
+	xorl	%r13d, %r13d
+.LBB146_46:                             # %return
+	movl	%r13d, %eax	# r13d carries the result on every path
+	addq	$440, %rsp                      # imm = 0x1B8
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end146:
+	.size	halide_buffer_copy_already_locked, .Lfunc_end146-halide_buffer_copy_already_locked
+                                        # -- End function
+	.section	.text.halide_buffer_copy,"ax",@progbits	# Locking wrapper: takes device_copy_mutex, calls the offset-0 table entry of each non-null interface, runs halide_buffer_copy_already_locked, calls the offset-8 entries, unlocks, and returns the copy result.
+	.weak	halide_buffer_copy              # -- Begin function halide_buffer_copy
+	.p2align	4, 0x90
+	.type	halide_buffer_copy,@function
+halide_buffer_copy:                     # @halide_buffer_copy
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	pushq	%rax	# reserves one 8-byte slot; released by addq $8, %rsp before the pops
+	movq	%rcx, %rbx	# rbx = arg4
+	movq	%rdx, %r14	# r14 = arg3
+	movq	%rsi, %r15	# r15 = arg2
+	movq	%rdi, %r12	# r12 = arg1
+	movq	_ZN6Halide7Runtime8Internal17device_copy_mutexE@GOTPCREL(%rip), %rdi
+	callq	halide_mutex_lock@PLT
+	movq	8(%r15), %r13	# r13 = pointer at offset 8 of arg2
+	testq	%r13, %r13
+	je	.LBB147_2
+# %bb.1:                                # %if.then.i
+	movq	120(%r13), %rax
+	callq	*(%rax)	# offset-0 entry of the function table at offset 120 of r13
+.LBB147_2:                              # %_ZN12_GLOBAL__N_19UseModuleC2EPK25halide_device_interface_t.exit
+	testq	%r14, %r14
+	je	.LBB147_3	# arg3 is null: take the variant that passes a null third argument
+# %bb.4:                                # %if.then.i23
+	movq	120(%r14), %rax
+	callq	*(%rax)	# offset-0 entry of the function table at offset 120 of arg3
+	movq	%r12, %rdi
+	movq	%r15, %rsi
+	movq	%r14, %rdx
+	movq	%rbx, %rcx
+	callq	halide_buffer_copy_already_locked@PLT
+	movl	%eax, %ebx	# ebx = result to return (arg4 is no longer needed)
+	movq	120(%r14), %rax
+	callq	*8(%rax)	# offset-8 entry of arg3's table, pairing with the offset-0 call above
+	testq	%r13, %r13
+	je	.LBB147_7
+.LBB147_6:                              # %if.then.i27
+	movq	120(%r13), %rax
+	callq	*8(%rax)	# offset-8 entry of r13's table, pairing with the offset-0 call at entry
+.LBB147_7:                              # %_ZN12_GLOBAL__N_19UseModuleD2Ev.exit28
+	movq	_ZN6Halide7Runtime8Internal17device_copy_mutexE@GOTPCREL(%rip), %rdi
+	callq	halide_mutex_unlock@PLT
+	movl	%ebx, %eax
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.LBB147_3:                              # %_ZN12_GLOBAL__N_19UseModuleC2EPK25halide_device_interface_t.exit20.thread
+	movq	%r12, %rdi
+	movq	%r15, %rsi
+	xorl	%edx, %edx	# third argument is null on this path
+	movq	%rbx, %rcx
+	callq	halide_buffer_copy_already_locked@PLT
+	movl	%eax, %ebx
+	testq	%r13, %r13
+	jne	.LBB147_6
+	jmp	.LBB147_7
+.Lfunc_end147:
+	.size	halide_buffer_copy, .Lfunc_end147-halide_buffer_copy
+                                        # -- End function
+	.section	.text.halide_default_device_crop,"ax",@progbits	# Always fails: formats the message at .L.str.58 into a temporary 1024-byte buffer, reports it through halide_error, and returns -40.
+	.weak	halide_default_device_crop      # -- Begin function halide_default_device_crop
+	.p2align	4, 0x90
+	.type	halide_default_device_crop,@function
+halide_default_device_crop:             # @halide_default_device_crop
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r14
+	pushq	%rbx
+	movq	%rdi, %rbx	# rbx = arg1, passed as first argument to halide_error
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT
+	movq	%rax, %r14	# r14 = message buffer, possibly null
+	testq	%rax, %rax
+	je	.LBB148_1	# malloc failed: use the fallback string instead
+# %bb.2:                                # %if.else.i
+	leaq	1023(%r14), %rsi	# rsi = address of the last byte of the buffer
+	movb	$0, 1023(%r14)	# zero that last byte
+	leaq	.L.str.58(%rip), %rdx
+	movq	%r14, %rdi
+	callq	halide_string_to_string@PLT
+	subq	%r14, %rax
+	leaq	1(%rax), %rdx	# rdx = (returned pointer - buffer start) + 1
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	callq	halide_msan_annotate_memory_is_initialized@PLT
+	movq	%r14, %rsi	# the formatted buffer is the message for halide_error
+	jmp	.LBB148_3
+.LBB148_1:                              # %if.then.i
+	leaq	.L.str.58(%rip), %rdx
+	xorl	%edi, %edi
+	xorl	%esi, %esi
+	callq	halide_string_to_string@PLT	# called with null destination and null end pointer
+	leaq	.L.str.29.165(%rip), %rsi	# fallback message used when no buffer could be allocated
+.LBB148_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT
+	movq	%r14, %rdi
+	callq	free@PLT	# r14 is the malloc result, which is null on the fallback path
+	movl	$-40, %eax	# fixed return value
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	retq
+.Lfunc_end148:
+	.size	halide_default_device_crop, .Lfunc_end148-halide_default_device_crop
+                                        # -- End function
+	.section	.text.halide_default_device_slice,"ax",@progbits	# Always fails: formats the message at .L.str.59 into a temporary 1024-byte buffer, reports it through halide_error, and returns -40.
+	.weak	halide_default_device_slice     # -- Begin function halide_default_device_slice
+	.p2align	4, 0x90
+	.type	halide_default_device_slice,@function
+halide_default_device_slice:            # @halide_default_device_slice
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r14
+	pushq	%rbx
+	movq	%rdi, %rbx	# rbx = arg1, passed as first argument to halide_error
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT
+	movq	%rax, %r14	# r14 = message buffer, possibly null
+	testq	%rax, %rax
+	je	.LBB149_1	# malloc failed: use the fallback string instead
+# %bb.2:                                # %if.else.i
+	leaq	1023(%r14), %rsi	# rsi = address of the last byte of the buffer
+	movb	$0, 1023(%r14)	# zero that last byte
+	leaq	.L.str.59(%rip), %rdx
+	movq	%r14, %rdi
+	callq	halide_string_to_string@PLT
+	subq	%r14, %rax
+	leaq	1(%rax), %rdx	# rdx = (returned pointer - buffer start) + 1
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	callq	halide_msan_annotate_memory_is_initialized@PLT
+	movq	%r14, %rsi	# the formatted buffer is the message for halide_error
+	jmp	.LBB149_3
+.LBB149_1:                              # %if.then.i
+	leaq	.L.str.59(%rip), %rdx
+	xorl	%edi, %edi
+	xorl	%esi, %esi
+	callq	halide_string_to_string@PLT	# called with null destination and null end pointer
+	leaq	.L.str.29.165(%rip), %rsi	# fallback message used when no buffer could be allocated
+.LBB149_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT
+	movq	%r14, %rdi
+	callq	free@PLT	# r14 is the malloc result, which is null on the fallback path
+	movl	$-40, %eax	# fixed return value
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	retq
+.Lfunc_end149:
+	.size	halide_default_device_slice, .Lfunc_end149-halide_default_device_slice
+                                        # -- End function
+	.section	.text.halide_device_crop,"ax",@progbits	# Under device_copy_mutex: returns 0 if the source (arg2) has a zero offset-0 slot, returns -41 if the destination (arg3) slot is nonzero or the offset-36 counts differ, otherwise forwards to the offset-88 entry of the source's function table.
+	.weak	halide_device_crop              # -- Begin function halide_device_crop
+	.p2align	4, 0x90
+	.type	halide_device_crop,@function
+halide_device_crop:                     # @halide_device_crop
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%rbx
+	pushq	%rax	# reserves one 8-byte slot; released by addq $8, %rsp at cleanup
+	movq	%rdx, %r14	# r14 = arg3, the destination buffer
+	movq	%rsi, %r15	# r15 = arg2, the source buffer
+	movq	%rdi, %rbx	# rbx = arg1
+	movq	_ZN6Halide7Runtime8Internal17device_copy_mutexE@GOTPCREL(%rip), %rdi
+	callq	halide_mutex_lock@PLT
+	cmpq	$0, (%r15)	# qword at offset 0 of the source
+	je	.LBB150_1	# zero: nothing to do, return 0
+# %bb.2:                                # %if.end
+	cmpq	$0, (%r14)	# qword at offset 0 of the destination
+	je	.LBB150_9	# zero: continue with the count check
+# %bb.3:                                # %if.then3
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT
+	movq	%rax, %r14	# r14 now holds the message buffer (destination pointer is no longer needed)
+	testq	%rax, %rax
+	je	.LBB150_4
+# %bb.6:                                # %if.else.i
+	leaq	1023(%r14), %rsi
+	movb	$0, 1023(%r14)
+	leaq	.L.str.60(%rip), %rdx	# message for a destination whose offset-0 slot is already nonzero
+	jmp	.LBB150_7
+.LBB150_1:
+	xorl	%ebx, %ebx	# return value 0
+	jmp	.LBB150_14
+.LBB150_9:                              # %if.end4
+	movl	36(%r15), %eax	# eax = 32-bit count at offset 36 of the source
+	cmpl	36(%r14), %eax	# compare with the same field of the destination
+	jne	.LBB150_10	# counts differ: report an error
+# %bb.13:                               # %if.end9
+	movq	8(%r15), %rax	# rax = pointer at offset 8 of the source
+	movq	120(%rax), %rax
+	callq	*(%rax)	# offset-0 entry of its function table
+	movq	8(%r15), %rax
+	movq	120(%rax), %rax
+	movq	%rbx, %rdi
+	movq	%r15, %rsi
+	movq	%r14, %rdx
+	callq	*88(%rax)	# offset-88 entry, called with (arg1, source, destination)
+	movl	%eax, %ebx	# its result is the return value
+	jmp	.LBB150_14
+.LBB150_4:                              # %if.then.i
+	leaq	.L.str.60(%rip), %rdx
+	jmp	.LBB150_5
+.LBB150_10:                             # %if.then6
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT
+	movq	%rax, %r14
+	testq	%rax, %rax
+	je	.LBB150_11
+# %bb.12:                               # %if.else.i63
+	leaq	1023(%r14), %rsi
+	movb	$0, 1023(%r14)
+	leaq	.L.str.61(%rip), %rdx	# message for mismatching offset-36 counts
+.LBB150_7:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%r14, %rdi
+	callq	halide_string_to_string@PLT
+	subq	%r14, %rax
+	leaq	1(%rax), %rdx	# rdx = (returned pointer - buffer start) + 1
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	callq	halide_msan_annotate_memory_is_initialized@PLT
+	movq	%r14, %rsi	# the formatted buffer is the message for halide_error
+	jmp	.LBB150_8
+.LBB150_11:                             # %if.then.i56
+	leaq	.L.str.61(%rip), %rdx
+.LBB150_5:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	xorl	%edi, %edi
+	xorl	%esi, %esi
+	callq	halide_string_to_string@PLT	# called with null destination and null end pointer
+	leaq	.L.str.29.165(%rip), %rsi	# fallback message used when no buffer could be allocated
+.LBB150_8:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT
+	movq	%r14, %rdi
+	callq	free@PLT	# r14 is the malloc result, which is null on the fallback path
+	movl	$-41, %ebx	# both error cases return -41
+.LBB150_14:                             # %cleanup
+	movq	_ZN6Halide7Runtime8Internal17device_copy_mutexE@GOTPCREL(%rip), %rdi
+	callq	halide_mutex_unlock@PLT	# every path through the function unlocks here
+	movl	%ebx, %eax
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end150:
+	.size	halide_device_crop, .Lfunc_end150-halide_device_crop
+                                        # -- End function
+	.section	.text.halide_device_slice,"ax",@progbits
+	.weak	halide_device_slice             # -- Begin function halide_device_slice
+	.p2align	4, 0x90
+	.type	halide_device_slice,@function
+halide_device_slice:                    # @halide_device_slice -- uses five incoming registers rdi, rsi, edx, ecx, r8 (arguments 1-5, assuming the SysV AMD64 convention); returns 0, -41 after reporting an error, or the result of the indirect call through slot 96 in %if.end9
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	pushq	%rax                            # extra 8-byte slot, released by addq $8, %rsp in %cleanup (presumably stack alignment padding)
+	movq	%r8, %r14                       # r14 = fifth argument (pointer; NOTE(review): presumably the destination buffer)
+	movl	%ecx, %r15d                     # r15d = fourth argument (32-bit)
+	movl	%edx, %r12d                     # r12d = third argument (32-bit)
+	movq	%rsi, %r13                      # r13 = second argument (pointer; NOTE(review): presumably the source buffer)
+	movq	%rdi, %rbx                      # rbx = first argument, later passed on to halide_error and the slot-96 call
+	movq	_ZN6Halide7Runtime8Internal17device_copy_mutexE@GOTPCREL(%rip), %rdi
+	callq	halide_mutex_lock@PLT           # device_copy_mutex stays locked until %cleanup
+	cmpq	$0, (%r13)                      # first 8 bytes of the object behind the second argument
+	je	.LBB151_1                       # zero: nothing to do, return 0
+# %bb.2:                                # %if.end
+	cmpq	$0, (%r14)                      # first 8 bytes of the object behind the fifth argument
+	je	.LBB151_9                       # zero: continue with the offset-36 check; nonzero falls through to the .L.str.60 error
+# %bb.3:                                # %if.then3
+	movl	$1024, %edi                     # imm = 0x400; size of the heap buffer used to format the error message
+	callq	malloc@PLT
+	movq	%rax, %r14                      # r14 now holds the message buffer (the fifth argument is no longer needed)
+	testq	%rax, %rax
+	je	.LBB151_4                       # malloc returned NULL
+# %bb.6:                                # %if.else.i
+	leaq	1023(%r14), %rsi                # rsi = address of the last byte of the buffer
+	movb	$0, 1023(%r14)                  # store a NUL in the last byte
+	leaq	.L.str.60(%rip), %rdx           # message text (string contents are not visible in this chunk)
+	jmp	.LBB151_7
+.LBB151_1:                              # first qword behind the second argument was zero
+	xorl	%ebx, %ebx                      # result = 0
+	jmp	.LBB151_14
+.LBB151_9:                              # %if.end4 -- the offset-36 field of the second argument must equal that of the fifth argument plus one
+	movl	36(%r14), %eax
+	incl	%eax
+	cmpl	%eax, 36(%r13)
+	jne	.LBB151_10                      # mismatch: report the .L.str.64 error
+# %bb.13:                               # %if.end9
+	movq	8(%r13), %rax                   # pointer stored at offset 8 of the second argument's object
+	movq	120(%rax), %rax                 # table pointer stored at offset 120 of that object
+	callq	*(%rax)                         # indirect call through table slot 0; no argument registers are set up for it here
+	movq	8(%r13), %rax                   # reload the table pointer after the call
+	movq	120(%rax), %rax
+	movq	%rbx, %rdi
+	movq	%r13, %rsi
+	movl	%r12d, %edx
+	movl	%r15d, %ecx
+	movq	%r14, %r8
+	callq	*96(%rax)                       # indirect call through the slot at offset 96 with the five original arguments
+	movl	%eax, %ebx                      # its return value becomes the function result
+	jmp	.LBB151_14
+.LBB151_4:                              # %if.then.i -- no buffer for the .L.str.60 message
+	leaq	.L.str.60(%rip), %rdx
+	jmp	.LBB151_5
+.LBB151_10:                             # %if.then6 -- offset-36 check failed
+	movl	$1024, %edi                     # imm = 0x400; size of the heap buffer used to format the error message
+	callq	malloc@PLT
+	movq	%rax, %r14
+	testq	%rax, %rax
+	je	.LBB151_11                      # malloc returned NULL
+# %bb.12:                               # %if.else.i63
+	leaq	1023(%r14), %rsi
+	movb	$0, 1023(%r14)
+	leaq	.L.str.64(%rip), %rdx
+.LBB151_7:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit -- shared tail when a buffer was allocated: rsi = buffer end, rdx = message
+	movq	%r14, %rdi
+	callq	halide_string_to_string@PLT
+	subq	%r14, %rax                      # returned pointer minus buffer start (presumably the number of bytes written)
+	leaq	1(%rax), %rdx                   # plus one
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	callq	halide_msan_annotate_memory_is_initialized@PLT
+	movq	%r14, %rsi                      # the formatted buffer is the message handed to halide_error
+	jmp	.LBB151_8
+.LBB151_11:                             # %if.then.i56 -- no buffer for the .L.str.64 message
+	leaq	.L.str.64(%rip), %rdx
+.LBB151_5:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit -- shared tail when malloc failed
+	xorl	%edi, %edi
+	xorl	%esi, %esi
+	callq	halide_string_to_string@PLT     # called with NULL destination and NULL end
+	leaq	.L.str.29.165(%rip), %rsi       # fixed fallback message used instead of the formatted one
+.LBB151_8:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT                # report with the first argument and the chosen message
+	movq	%r14, %rdi
+	callq	free@PLT                        # r14 is the buffer, or NULL on the malloc-failure path
+	movl	$-41, %ebx                      # both error paths return -41
+.LBB151_14:                             # %cleanup
+	movq	_ZN6Halide7Runtime8Internal17device_copy_mutexE@GOTPCREL(%rip), %rdi
+	callq	halide_mutex_unlock@PLT
+	movl	%ebx, %eax
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end151:
+	.size	halide_device_slice, .Lfunc_end151-halide_device_slice
+                                        # -- End function
+	.section	.text.halide_default_device_release_crop,"ax",@progbits
+	.weak	halide_default_device_release_crop # -- Begin function halide_default_device_release_crop
+	.p2align	4, 0x90
+	.type	halide_default_device_release_crop,@function
+halide_default_device_release_crop:     # @halide_default_device_release_crop -- returns 0 when the first qword behind rsi is zero; otherwise reports the .L.str.58 message through halide_error and returns -40
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r14
+	pushq	%rbx
+	cmpq	$0, (%rsi)                      # first 8 bytes of the object behind the second argument
+	je	.LBB152_1                       # zero: return 0 without reporting anything
+# %bb.2:                                # %if.end
+	movq	%rdi, %rbx                      # rbx = first argument, later passed on to halide_error
+	movl	$1024, %edi                     # imm = 0x400; size of the heap buffer used to format the error message
+	callq	malloc@PLT
+	movq	%rax, %r14
+	testq	%rax, %rax
+	je	.LBB152_3                       # malloc returned NULL
+# %bb.4:                                # %if.else.i
+	leaq	1023(%r14), %rsi                # rsi = address of the last byte of the buffer
+	movb	$0, 1023(%r14)                  # store a NUL in the last byte
+	leaq	.L.str.58(%rip), %rdx           # message text (string contents are not visible in this chunk)
+	movq	%r14, %rdi
+	callq	halide_string_to_string@PLT
+	subq	%r14, %rax                      # returned pointer minus buffer start (presumably the number of bytes written)
+	leaq	1(%rax), %rdx                   # plus one
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	callq	halide_msan_annotate_memory_is_initialized@PLT
+	movq	%r14, %rsi                      # the formatted buffer is the message handed to halide_error
+	jmp	.LBB152_5
+.LBB152_1:                              # early exit: nothing to report
+	xorl	%eax, %eax                      # return 0
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	retq
+.LBB152_3:                              # %if.then.i -- malloc failed
+	leaq	.L.str.58(%rip), %rdx
+	xorl	%edi, %edi
+	xorl	%esi, %esi
+	callq	halide_string_to_string@PLT     # called with NULL destination and NULL end
+	leaq	.L.str.29.165(%rip), %rsi       # fixed fallback message used instead of the formatted one
+.LBB152_5:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT
+	movq	%r14, %rdi
+	callq	free@PLT                        # r14 is the buffer, or NULL on the malloc-failure path
+	movl	$-40, %eax                      # error return value
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	retq
+.Lfunc_end152:
+	.size	halide_default_device_release_crop, .Lfunc_end152-halide_default_device_release_crop
+                                        # -- End function
+	.section	.text.halide_device_release_crop,"ax",@progbits
+	.weak	halide_device_release_crop      # -- Begin function halide_device_release_crop
+	.p2align	4, 0x90
+	.type	halide_device_release_crop,@function
+halide_device_release_crop:             # @halide_device_release_crop -- returns 0 when the first qword behind rsi is zero; otherwise, under device_copy_mutex, calls table slot 104, clears the first two qwords of the object, and returns the slot-104 result
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r12
+	pushq	%rbx
+	cmpq	$0, (%rsi)                      # first 8 bytes of the object behind the second argument
+	je	.LBB153_2                       # zero: return 0 without taking the mutex
+# %bb.1:                                # %if.then
+	movq	%rsi, %rbx                      # rbx = second argument (object pointer)
+	movq	%rdi, %r14                      # r14 = first argument
+	movq	_ZN6Halide7Runtime8Internal17device_copy_mutexE@GOTPCREL(%rip), %r15
+	movq	%r15, %rdi                      # mutex address kept in r15 for the unlock below
+	callq	halide_mutex_lock@PLT
+	movq	8(%rbx), %r12                   # r12 = pointer stored at offset 8, saved before that field is cleared below
+	movq	120(%r12), %rax                 # table pointer stored at offset 120 of that object
+	movq	%r14, %rdi
+	movq	%rbx, %rsi
+	callq	*104(%rax)                      # indirect call through the slot at offset 104 with the two original arguments
+	movl	%eax, %r14d                     # keep its result as the function's return value
+	movq	$0, (%rbx)                      # clear the first qword of the object
+	movq	120(%r12), %rax
+	callq	*8(%rax)                        # indirect call through the slot at offset 8; no argument registers are set up for it here
+	movq	$0, 8(%rbx)                     # clear the pointer at offset 8
+	movq	%r15, %rdi
+	callq	halide_mutex_unlock@PLT
+	movl	%r14d, %eax
+	jmp	.LBB153_3
+.LBB153_2:                              # %return -- nothing to release
+	xorl	%eax, %eax
+.LBB153_3:                              # %return
+	popq	%rbx
+	popq	%r12
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end153:
+	.size	halide_device_release_crop, .Lfunc_end153-halide_device_release_crop
+                                        # -- End function
+	.section	.text.halide_float16_bits_to_float,"ax",@progbits
+	.weak	halide_float16_bits_to_float    # -- Begin function halide_float16_bits_to_float
+	.p2align	4, 0x90
+	.type	halide_float16_bits_to_float,@function
+halide_float16_bits_to_float:           # @halide_float16_bits_to_float -- widens the 16-bit pattern in di (sign bit 15, 5-bit exponent, 10-bit mantissa) into a 32-bit float bit pattern returned in xmm0
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movl	%edi, %ecx
+	shrl	$10, %ecx
+	andl	$31, %ecx                       # ecx = bits 10..14 of the input (exponent field)
+	movl	%edi, %eax
+	andl	$1023, %eax                     # imm = 0x3FF; eax = low 10 bits (mantissa field); ZF is set when it is zero
+	je	.LBB154_3                       # mantissa zero: take the general path
+# %bb.1:                                # %entry
+	testl	%ecx, %ecx
+	jne	.LBB154_3                       # exponent nonzero: take the general path
+# %bb.2:                                # %if.then -- exponent field 0 with nonzero mantissa (half-precision subnormal): renormalize
+	xorl	%ecx, %ecx                      # cleared just before lzcnt overwrites it (presumably a dependency-breaking idiom)
+	lzcntl	%eax, %ecx                      # ecx = count of leading zero bits of the mantissa viewed as a 32-bit value
+	movl	%ecx, %edx
+	xorb	$31, %dl                        # edx = 31 - lzcnt = index of the highest set mantissa bit
+	btrl	%edx, %eax                      # clear that bit; it becomes the implicit leading one of the float
+	movb	$23, %sil
+	subb	%dl, %sil                       # shift count = 23 - index
+	shlxl	%esi, %eax, %eax                # move the remaining bits up to sit just below bit 23
+	shll	$23, %ecx                       # lzcnt placed in the float exponent position
+	movl	$1124073472, %edx               # imm = 0x43000000 (134 in the float exponent field)
+	subl	%ecx, %edx                      # float exponent field = 134 - lzcnt
+	jmp	.LBB154_7
+.LBB154_3:                              # %if.else
+	shll	$13, %eax                       # widen the 10-bit mantissa to the 23-bit float mantissa field
+	testl	%ecx, %ecx
+	je	.LBB154_4                       # exponent 0 (mantissa is also 0 on this path): exponent bits stay zero
+# %bb.5:                                # %if.else18
+	movl	$2139095040, %edx               # imm = 0x7F800000; all-ones float exponent field
+	cmpl	$31, %ecx
+	je	.LBB154_7                       # exponent field 31 maps to the all-ones float exponent
+# %bb.6:                                # %if.else21
+	shll	$23, %ecx
+	addl	$939524096, %ecx                # imm = 0x38000000; adds 112 to the exponent (bias 127 minus bias 15)
+	movl	%ecx, %edx
+	jmp	.LBB154_7
+.LBB154_4:                              # zero input magnitude
+	xorl	%edx, %edx
+.LBB154_7:                              # %if.end23
+	orl	%eax, %edx                      # combine exponent and mantissa fields
+	movswl	%di, %eax                       # sign-extend the 16-bit input so its bit 15 is copied into bit 31
+	andl	$-2147483648, %eax              # imm = 0x80000000; keep only the sign bit
+	orl	%edx, %eax
+	vmovd	%eax, %xmm0                     # return the assembled bit pattern in xmm0
+	popq	%rbp
+	retq
+.Lfunc_end154:
+	.size	halide_float16_bits_to_float, .Lfunc_end154-halide_float16_bits_to_float
+                                        # -- End function
+	.section	.text.halide_float16_bits_to_double,"ax",@progbits
+	.weak	halide_float16_bits_to_double   # -- Begin function halide_float16_bits_to_double
+	.p2align	4, 0x90
+	.type	halide_float16_bits_to_double,@function
+halide_float16_bits_to_double:          # @halide_float16_bits_to_double -- converts via halide_float16_bits_to_float, then widens the float result to double
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	callq	halide_float16_bits_to_float@PLT # the incoming argument register is passed through untouched
+	vcvtss2sd	%xmm0, %xmm0, %xmm0     # single to double precision, result stays in xmm0
+	popq	%rbp
+	retq
+.Lfunc_end155:
+	.size	halide_float16_bits_to_double, .Lfunc_end155-halide_float16_bits_to_double
+                                        # -- End function
+	.section	.text.halide_error_bounds_inference_call_failed,"ax",@progbits
+	.weak	halide_error_bounds_inference_call_failed # -- Begin function halide_error_bounds_inference_call_failed
+	.p2align	4, 0x90
+	.type	halide_error_bounds_inference_call_failed,@function
+halide_error_bounds_inference_call_failed: # @halide_error_bounds_inference_call_failed -- formats .L.str.112 + string in rsi + .L.str.1.113 + integer in edx, reports it through halide_error, and returns edx unchanged
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	pushq	%rax                            # extra 8-byte slot, released by addq $8, %rsp before returning (presumably stack alignment padding)
+	movl	%edx, %ebx                      # ebx = third argument (32-bit), printed and also returned
+	movq	%rsi, %r12                      # r12 = second argument (string pointer)
+	movq	%rdi, %r14                      # r14 = first argument, later passed on to halide_error
+	movl	$1024, %edi                     # imm = 0x400; size of the heap buffer used to format the message
+	callq	malloc@PLT
+	movq	%rax, %r15                      # r15 = buffer start (NULL if malloc failed)
+	testq	%rax, %rax
+	je	.LBB156_1
+# %bb.2:                                # %if.then6.i
+	leaq	1023(%r15), %r13                # r13 = address of the last byte, passed as the end pointer to every append call
+	movb	$0, 1023(%r15)                  # store a NUL in the last byte
+	leaq	.L.str.112(%rip), %rdx
+	movq	%r15, %rdi
+	movq	%r13, %rsi
+	jmp	.LBB156_3
+.LBB156_1:                              # %entry.split -- malloc failed: the append calls below run with NULL destination and NULL end
+	leaq	.L.str.112(%rip), %rdx
+	xorl	%r13d, %r13d
+	xorl	%edi, %edi
+	xorl	%esi, %esi
+.LBB156_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit
+	callq	halide_string_to_string@PLT     # each append call returns the destination pointer for the next one
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movq	%r12, %rdx                      # append the caller-supplied string
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.1.113(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	movslq	%ebx, %rdx                      # sign-extend the 32-bit value to 64 bits
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movl	$1, %ecx                        # fourth argument 1 (NOTE(review): presumably the minimum digit count)
+	callq	halide_int64_to_string@PLT
+	testq	%r15, %r15
+	je	.LBB156_4                       # no buffer: use the fallback message
+# %bb.5:                                # %if.else.i
+	subq	%r15, %rax                      # final write pointer minus buffer start
+	incq	%rax                            # plus one
+	movq	%r14, %rdi
+	movq	%r15, %rsi
+	movq	%rax, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT
+	movq	%r15, %rsi                      # the formatted buffer is the message handed to halide_error
+	jmp	.LBB156_6
+.LBB156_4:                              # malloc had failed
+	leaq	.L.str.29.165(%rip), %rsi       # fixed fallback message
+.LBB156_6:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%r14, %rdi
+	callq	halide_error@PLT
+	movq	%r15, %rdi
+	callq	free@PLT                        # r15 is the buffer, or NULL on the malloc-failure path
+	movl	%ebx, %eax                      # return the third argument unchanged
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end156:
+	.size	halide_error_bounds_inference_call_failed, .Lfunc_end156-halide_error_bounds_inference_call_failed
+                                        # -- End function
+	.section	.text.halide_error_extern_stage_failed,"ax",@progbits
+	.weak	halide_error_extern_stage_failed # -- Begin function halide_error_extern_stage_failed
+	.p2align	4, 0x90
+	.type	halide_error_extern_stage_failed,@function
+halide_error_extern_stage_failed:       # @halide_error_extern_stage_failed -- formats .L.str.2.114 + string in rsi + .L.str.1.113 + integer in edx, reports it through halide_error, and returns edx unchanged
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	pushq	%rax                            # extra 8-byte slot, released by addq $8, %rsp before returning (presumably stack alignment padding)
+	movl	%edx, %ebx                      # ebx = third argument (32-bit), printed and also returned
+	movq	%rsi, %r12                      # r12 = second argument (string pointer)
+	movq	%rdi, %r14                      # r14 = first argument, later passed on to halide_error
+	movl	$1024, %edi                     # imm = 0x400; size of the heap buffer used to format the message
+	callq	malloc@PLT
+	movq	%rax, %r15                      # r15 = buffer start (NULL if malloc failed)
+	testq	%rax, %rax
+	je	.LBB157_1
+# %bb.2:                                # %if.then6.i
+	leaq	1023(%r15), %r13                # r13 = address of the last byte, passed as the end pointer to every append call
+	movb	$0, 1023(%r15)                  # store a NUL in the last byte
+	leaq	.L.str.2.114(%rip), %rdx
+	movq	%r15, %rdi
+	movq	%r13, %rsi
+	jmp	.LBB157_3
+.LBB157_1:                              # %entry.split -- malloc failed: the append calls below run with NULL destination and NULL end
+	leaq	.L.str.2.114(%rip), %rdx
+	xorl	%r13d, %r13d
+	xorl	%edi, %edi
+	xorl	%esi, %esi
+.LBB157_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit
+	callq	halide_string_to_string@PLT     # each append call returns the destination pointer for the next one
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movq	%r12, %rdx                      # append the caller-supplied string
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.1.113(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	movslq	%ebx, %rdx                      # sign-extend the 32-bit value to 64 bits
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movl	$1, %ecx                        # fourth argument 1 (NOTE(review): presumably the minimum digit count)
+	callq	halide_int64_to_string@PLT
+	testq	%r15, %r15
+	je	.LBB157_4                       # no buffer: use the fallback message
+# %bb.5:                                # %if.else.i
+	subq	%r15, %rax                      # final write pointer minus buffer start
+	incq	%rax                            # plus one
+	movq	%r14, %rdi
+	movq	%r15, %rsi
+	movq	%rax, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT
+	movq	%r15, %rsi                      # the formatted buffer is the message handed to halide_error
+	jmp	.LBB157_6
+.LBB157_4:                              # malloc had failed
+	leaq	.L.str.29.165(%rip), %rsi       # fixed fallback message
+.LBB157_6:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%r14, %rdi
+	callq	halide_error@PLT
+	movq	%r15, %rdi
+	callq	free@PLT                        # r15 is the buffer, or NULL on the malloc-failure path
+	movl	%ebx, %eax                      # return the third argument unchanged
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end157:
+	.size	halide_error_extern_stage_failed, .Lfunc_end157-halide_error_extern_stage_failed
+                                        # -- End function
+	.section	.text.halide_error_explicit_bounds_too_small,"ax",@progbits
+	.weak	halide_error_explicit_bounds_too_small # -- Begin function halide_error_explicit_bounds_too_small
+	.p2align	4, 0x90
+	.type	halide_error_explicit_bounds_too_small,@function
+halide_error_explicit_bounds_too_small: # @halide_error_explicit_bounds_too_small -- formats a message from two strings (rdx, then rsi) and four 32-bit integers (ecx, r8d, r9d, and the stack argument at 16(%rbp)), reports it through halide_error, and returns -2
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	subq	$24, %rsp                       # room for the three spill slots used below
+	movl	%r9d, -48(%rbp)                 # 4-byte Spill; sixth argument
+	movl	%r8d, -44(%rbp)                 # 4-byte Spill; fifth argument
+	movl	%ecx, %ebx                      # ebx = fourth argument
+	movq	%rdx, %r13                      # r13 = third argument (string pointer)
+	movq	%rsi, %r12                      # r12 = second argument (string pointer)
+	movq	%rdi, -56(%rbp)                 # 8-byte Spill; first argument, reloaded for halide_error
+	movl	$1024, %edi                     # imm = 0x400; size of the heap buffer used to format the message
+	callq	malloc@PLT
+	movq	%rax, %r14                      # r14 = buffer start (NULL if malloc failed)
+	testq	%rax, %rax
+	je	.LBB158_1
+# %bb.2:                                # %if.then6.i
+	leaq	1023(%r14), %r15                # r15 = address of the last byte, passed as the end pointer to every append call
+	movb	$0, 1023(%r14)                  # store a NUL in the last byte
+	leaq	.L.str.3.115(%rip), %rdx
+	movq	%r14, %rdi
+	movq	%r15, %rsi
+	jmp	.LBB158_3
+.LBB158_1:                              # %entry.split -- malloc failed: the append calls below run with NULL destination and NULL end
+	leaq	.L.str.3.115(%rip), %rdx
+	xorl	%r15d, %r15d
+	xorl	%edi, %edi
+	xorl	%esi, %esi
+.LBB158_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit
+	callq	halide_string_to_string@PLT     # each append call returns the destination pointer for the next one
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movq	%r13, %rdx                      # append the string from the third argument
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.4.116(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movq	%r12, %rdx                      # append the string from the second argument
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.5.117(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	callq	halide_string_to_string@PLT
+	movslq	%ebx, %rdx                      # fourth argument, sign-extended to 64 bits
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movl	$1, %ecx                        # fourth parameter 1 (NOTE(review): presumably the minimum digit count)
+	callq	halide_int64_to_string@PLT
+	leaq	.L.str.6.118(%rip), %rbx        # separator string kept in rbx because it is appended twice
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movq	%rbx, %rdx
+	callq	halide_string_to_string@PLT
+	movslq	-44(%rbp), %rdx                 # 4-byte Folded Reload; fifth argument
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	leaq	.L.str.7.119(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	callq	halide_string_to_string@PLT
+	movslq	-48(%rbp), %rdx                 # 4-byte Folded Reload; sixth argument
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movq	%rbx, %rdx                      # second use of .L.str.6.118
+	callq	halide_string_to_string@PLT
+	movslq	16(%rbp), %rdx                  # seventh argument, read from the caller's stack area
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	leaq	.L.str.8.120(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	callq	halide_string_to_string@PLT
+	testq	%r14, %r14
+	je	.LBB158_4                       # no buffer: use the fallback message
+# %bb.5:                                # %if.else.i
+	subq	%r14, %rax                      # final write pointer minus buffer start
+	incq	%rax                            # plus one
+	movq	-56(%rbp), %rbx                 # 8-byte Reload; first argument
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	movq	%rax, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT
+	movq	%r14, %rsi                      # the formatted buffer is the message handed to halide_error
+	jmp	.LBB158_6
+.LBB158_4:                              # malloc had failed
+	leaq	.L.str.29.165(%rip), %rsi       # fixed fallback message
+	movq	-56(%rbp), %rbx                 # 8-byte Reload; first argument
+.LBB158_6:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT
+	movq	%r14, %rdi
+	callq	free@PLT                        # r14 is the buffer, or NULL on the malloc-failure path
+	movl	$-2, %eax                       # fixed return value
+	addq	$24, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end158:
+	.size	halide_error_explicit_bounds_too_small, .Lfunc_end158-halide_error_explicit_bounds_too_small
+                                        # -- End function
+	.section	.text.halide_error_bad_type,"ax",@progbits
+	.weak	halide_error_bad_type           # -- Begin function halide_error_bad_type
+	.p2align	4, 0x90
+	.type	halide_error_bad_type,@function
+halide_error_bad_type:                  # @halide_error_bad_type -- formats string in rsi + .L.str.9.121 + type built from ecx + .L.str.10.122 + type built from edx, reports it through halide_error, and returns -3
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r12
+	pushq	%rbx
+	subq	$32, %rsp                       # local storage for the four 4-byte slots used below
+	movq	%rsi, %r12                      # r12 = second argument (string pointer)
+	movq	%rdi, %rbx                      # rbx = first argument, later passed on to halide_error
+	movl	%edx, -56(%rbp)                 # third argument stored to the stack so it can be copied bytewise
+	movl	%ecx, -52(%rbp)                 # fourth argument stored likewise
+	movl	$0, -48(%rbp)                   # zero-initialize the first 4-byte local
+	movl	$0, -40(%rbp)                   # zero-initialize the second 4-byte local
+	leaq	-48(%rbp), %rdi
+	leaq	-52(%rbp), %rsi
+	movl	$4, %edx
+	callq	memcpy@PLT                      # first local = 4 bytes of the fourth argument
+	leaq	-40(%rbp), %rdi
+	leaq	-56(%rbp), %rsi
+	movl	$4, %edx
+	callq	memcpy@PLT                      # second local = 4 bytes of the third argument
+	movl	$1024, %edi                     # imm = 0x400; size of the heap buffer used to format the message
+	callq	malloc@PLT
+	movq	%rax, %r14                      # r14 = buffer start (NULL if malloc failed)
+	testq	%rax, %rax
+	je	.LBB159_1
+# %bb.2:                                # %if.then6.i
+	leaq	1023(%r14), %r15                # r15 = address of the last byte, passed as the end pointer to every append call
+	movb	$0, 1023(%r14)                  # store a NUL in the last byte
+	movq	%r14, %rdi
+	movq	%r15, %rsi
+	jmp	.LBB159_3
+.LBB159_1:                              # %entry.split -- malloc failed: the append calls below run with NULL destination and NULL end
+	xorl	%r15d, %r15d
+	xorl	%edi, %edi
+	xorl	%esi, %esi
+.LBB159_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit
+	movq	%r12, %rdx                      # the message starts with the caller-supplied string
+	callq	halide_string_to_string@PLT     # each append call returns the destination pointer for the next one
+	leaq	.L.str.9.121(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	callq	halide_string_to_string@PLT
+	leaq	-48(%rbp), %rdx                 # pointer to the local copied from the fourth argument
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	callq	halide_type_to_string@PLT
+	leaq	.L.str.10.122(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	callq	halide_string_to_string@PLT
+	leaq	-40(%rbp), %rdx                 # pointer to the local copied from the third argument
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	callq	halide_type_to_string@PLT
+	testq	%r14, %r14
+	je	.LBB159_4                       # no buffer: use the fallback message
+# %bb.5:                                # %if.else.i
+	subq	%r14, %rax                      # final write pointer minus buffer start
+	incq	%rax                            # plus one
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	movq	%rax, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT
+	movq	%r14, %rsi                      # the formatted buffer is the message handed to halide_error
+	jmp	.LBB159_6
+.LBB159_4:                              # malloc had failed
+	leaq	.L.str.29.165(%rip), %rsi       # fixed fallback message
+.LBB159_6:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT
+	movq	%r14, %rdi
+	callq	free@PLT                        # r14 is the buffer, or NULL on the malloc-failure path
+	movl	$-3, %eax                       # fixed return value
+	addq	$32, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end159:
+	.size	halide_error_bad_type, .Lfunc_end159-halide_error_bad_type
+                                        # -- End function
+	.section	.text.halide_error_bad_dimensions,"ax",@progbits
+	.weak	halide_error_bad_dimensions     # -- Begin function halide_error_bad_dimensions
+	.p2align	4, 0x90
+	.type	halide_error_bad_dimensions,@function
+halide_error_bad_dimensions:            # @halide_error_bad_dimensions -- formats string in rsi + .L.str.11.123 + integer in ecx + .L.str.12.124 + integer in edx + .L.str.13.125, reports it through halide_error, and returns -43
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	pushq	%rax                            # 8-byte slot at -48(%rbp), used as the spill slot for the first argument
+	movl	%ecx, %r13d                     # r13d = fourth argument (printed first)
+	movl	%edx, %r15d                     # r15d = third argument (printed second)
+	movq	%rsi, %rbx                      # rbx = second argument (string pointer)
+	movq	%rdi, -48(%rbp)                 # 8-byte Spill; first argument, reloaded for halide_error
+	movl	$1024, %edi                     # imm = 0x400; size of the heap buffer used to format the message
+	callq	malloc@PLT
+	movq	%rax, %r14                      # r14 = buffer start (NULL if malloc failed)
+	testq	%rax, %rax
+	je	.LBB160_1
+# %bb.2:                                # %if.then6.i
+	leaq	1023(%r14), %r12                # r12 = address of the last byte, passed as the end pointer to every append call
+	movb	$0, 1023(%r14)                  # store a NUL in the last byte
+	movq	%r14, %rdi
+	movq	%r12, %rsi
+	jmp	.LBB160_3
+.LBB160_1:                              # %entry.split -- malloc failed: the append calls below run with NULL destination and NULL end
+	xorl	%r12d, %r12d
+	xorl	%edi, %edi
+	xorl	%esi, %esi
+.LBB160_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit
+	movq	%rbx, %rdx                      # the message starts with the caller-supplied string
+	callq	halide_string_to_string@PLT     # each append call returns the destination pointer for the next one
+	leaq	.L.str.11.123(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	callq	halide_string_to_string@PLT
+	movslq	%r13d, %rdx                     # fourth argument, sign-extended to 64 bits
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movl	$1, %ecx                        # fourth parameter 1 (NOTE(review): presumably the minimum digit count)
+	callq	halide_int64_to_string@PLT
+	leaq	.L.str.12.124(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	callq	halide_string_to_string@PLT
+	movslq	%r15d, %rdx                     # third argument, sign-extended to 64 bits
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	leaq	.L.str.13.125(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	callq	halide_string_to_string@PLT
+	testq	%r14, %r14
+	movq	-48(%rbp), %rbx                 # 8-byte Reload; first argument (the mov between test and je leaves the flags untouched)
+	je	.LBB160_4                       # no buffer: use the fallback message
+# %bb.5:                                # %if.else.i
+	subq	%r14, %rax                      # final write pointer minus buffer start
+	incq	%rax                            # plus one
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	movq	%rax, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT
+	movq	%r14, %rsi                      # the formatted buffer is the message handed to halide_error
+	jmp	.LBB160_6
+.LBB160_4:                              # malloc had failed
+	leaq	.L.str.29.165(%rip), %rsi       # fixed fallback message
+.LBB160_6:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT
+	movq	%r14, %rdi
+	callq	free@PLT                        # r14 is the buffer, or NULL on the malloc-failure path
+	movl	$-43, %eax                      # fixed return value
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end160:
+	.size	halide_error_bad_dimensions, .Lfunc_end160-halide_error_bad_dimensions
+                                        # -- End function
+	.section	.text.halide_error_access_out_of_bounds,"ax",@progbits
+	.weak	halide_error_access_out_of_bounds # -- Begin function halide_error_access_out_of_bounds
+	.p2align	4, 0x90
+	.type	halide_error_access_out_of_bounds,@function
+halide_error_access_out_of_bounds:      # @halide_error_access_out_of_bounds -- reports one of two messages: when ecx is less than r9d (signed), or otherwise when r8d is greater than the stack argument at 16(%rbp); reports nothing if neither holds; always returns -4
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	subq	$24, %rsp                       # room for the two spill slots used below
+	movq	%rsi, %rbx                      # rbx = second argument (string pointer)
+	movq	%rdi, -56(%rbp)                 # 8-byte Spill; first argument, reloaded for halide_error
+	cmpl	%r9d, %ecx                      # signed compare of the fourth argument against the sixth
+	jge	.LBB161_7                       # fourth is not below sixth: go on to the second check
+# %bb.1:                                # %if.then -- fourth argument is less than the sixth
+	movl	%r9d, %r12d                     # r12d = sixth argument
+	movl	%ecx, %r15d                     # r15d = fourth argument
+	movl	%edx, -44(%rbp)                 # 4-byte Spill; third argument
+	movl	$1024, %edi                     # imm = 0x400; size of the heap buffer used to format the message
+	callq	malloc@PLT
+	movq	%rax, %r14                      # r14 = buffer start (NULL if malloc failed)
+	testq	%rax, %rax
+	je	.LBB161_2
+# %bb.3:                                # %if.then6.i
+	leaq	1023(%r14), %r13                # r13 = address of the last byte, used as the end pointer on this path
+	movb	$0, 1023(%r14)                  # store a NUL in the last byte
+	movq	%r14, %rdi
+	movq	%r13, %rsi
+	jmp	.LBB161_4
+.LBB161_7:                              # %if.else
+	movl	%r8d, %r13d                     # r13d = fifth argument
+	movl	16(%rbp), %r15d                 # r15d = seventh argument, read from the caller's stack area
+	cmpl	%r15d, %r8d                     # signed compare of the fifth argument against the seventh
+	jle	.LBB161_14                      # fifth is not above seventh: nothing to report
+# %bb.8:                                # %if.then8 -- fifth argument is greater than the seventh
+	movl	%edx, -44(%rbp)                 # 4-byte Spill; third argument
+	movl	$1024, %edi                     # imm = 0x400; size of the heap buffer used to format the message
+	callq	malloc@PLT
+	movq	%rax, %r14                      # r14 = buffer start (NULL if malloc failed)
+	testq	%rax, %rax
+	je	.LBB161_9
+# %bb.10:                               # %if.then6.i58
+	leaq	1023(%r14), %r12                # r12 = address of the last byte, used as the end pointer on this path
+	movb	$0, 1023(%r14)                  # store a NUL in the last byte
+	movq	%r14, %rdi
+	movq	%r12, %rsi
+	jmp	.LBB161_11
+.LBB161_2:                              # %if.then.split -- malloc failed on the first path
+	xorl	%r13d, %r13d
+	xorl	%edi, %edi
+	xorl	%esi, %esi
+.LBB161_4:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit -- first message: string + .L.str.14.126 + fourth arg + .L.str.15.127 + sixth arg + .L.str.16.128 + third arg
+	movq	%rbx, %rdx
+	callq	halide_string_to_string@PLT     # each append call returns the destination pointer for the next one
+	movq	%rax, %rdi
+	leaq	.L.str.14.126(%rip), %rdx
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	movslq	%r15d, %rdx                     # fourth argument, sign-extended to 64 bits
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movl	$1, %ecx                        # fourth parameter 1 (NOTE(review): presumably the minimum digit count)
+	callq	halide_int64_to_string@PLT
+	leaq	.L.str.15.127(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	movslq	%r12d, %rdx                     # sixth argument
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	leaq	.L.str.16.128(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	movslq	-44(%rbp), %rdx                 # 4-byte Folded Reload; third argument
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	testq	%r14, %r14
+	je	.LBB161_6                       # no buffer: use the fallback message
+.LBB161_12:                             # shared by both messages when a buffer was allocated
+	movq	-56(%rbp), %r15                 # 8-byte Reload; first argument
+	subq	%r14, %rax                      # final write pointer minus buffer start
+	incq	%rax                            # plus one
+	movq	%r15, %rdi
+	movq	%r14, %rsi
+	movq	%rax, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT
+	movq	%r14, %rbx                      # rbx = pointer to free afterwards (the buffer)
+	jmp	.LBB161_13
+.LBB161_9:                              # %if.then8.split -- malloc failed on the second path
+	xorl	%r12d, %r12d
+	xorl	%edi, %edi
+	xorl	%esi, %esi
+.LBB161_11:                             # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit61 -- second message: string + .L.str.14.126 + fifth arg + .L.str.17.129 + seventh arg + .L.str.16.128 + third arg
+	movq	%rbx, %rdx
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.14.126(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	callq	halide_string_to_string@PLT
+	movslq	%r13d, %rdx                     # fifth argument, sign-extended to 64 bits
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	leaq	.L.str.17.129(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	callq	halide_string_to_string@PLT
+	movslq	%r15d, %rdx                     # seventh argument
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	leaq	.L.str.16.128(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	callq	halide_string_to_string@PLT
+	movslq	-44(%rbp), %rdx                 # 4-byte Folded Reload; third argument
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	testq	%r14, %r14
+	jne	.LBB161_12                      # buffer present: annotate and report it
+.LBB161_6:                              # malloc had failed on either path
+	xorl	%ebx, %ebx                      # nothing to free (free is called with NULL)
+	leaq	.L.str.29.165(%rip), %r14       # fixed fallback message
+	movq	-56(%rbp), %r15                 # 8-byte Reload; first argument
+.LBB161_13:                             # %if.end17.sink.split -- r15 = first argument, r14 = message, rbx = pointer to free
+	movq	%r15, %rdi
+	movq	%r14, %rsi
+	callq	halide_error@PLT
+	movq	%rbx, %rdi
+	callq	free@PLT
+.LBB161_14:                             # %if.end17
+	movl	$-4, %eax                       # fixed return value, also when nothing was reported
+	addq	$24, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end161:
+	.size	halide_error_access_out_of_bounds, .Lfunc_end161-halide_error_access_out_of_bounds
+                                        # -- End function
+	.section	.text.halide_error_buffer_allocation_too_large,"ax",@progbits
+	.weak	halide_error_buffer_allocation_too_large # -- Begin function halide_error_buffer_allocation_too_large
+	.p2align	4, 0x90
+	.type	halide_error_buffer_allocation_too_large,@function
+halide_error_buffer_allocation_too_large: # @halide_error_buffer_allocation_too_large -- formats .L.str.18.130 + string in rsi + .L.str.19.131 + unsigned 64-bit rdx + .L.str.20.132 + unsigned 64-bit rcx, reports it through halide_error, and returns -5
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	pushq	%rax                            # 8-byte slot at -48(%rbp), used as the spill slot for the fourth argument
+	movq	%rcx, -48(%rbp)                 # 8-byte Spill; fourth argument (64-bit)
+	movq	%rdx, %r12                      # r12 = third argument (64-bit)
+	movq	%rsi, %r15                      # r15 = second argument (string pointer)
+	movq	%rdi, %rbx                      # rbx = first argument, later passed on to halide_error
+	movl	$1024, %edi                     # imm = 0x400; size of the heap buffer used to format the message
+	callq	malloc@PLT
+	movq	%rax, %r14                      # r14 = buffer start (NULL if malloc failed)
+	testq	%rax, %rax
+	je	.LBB162_1
+# %bb.2:                                # %if.then6.i
+	leaq	1023(%r14), %r13                # r13 = address of the last byte, passed as the end pointer to every append call
+	movb	$0, 1023(%r14)                  # store a NUL in the last byte
+	leaq	.L.str.18.130(%rip), %rdx
+	movq	%r14, %rdi
+	movq	%r13, %rsi
+	jmp	.LBB162_3
+.LBB162_1:                              # %entry.split -- malloc failed: the append calls below run with NULL destination and NULL end
+	leaq	.L.str.18.130(%rip), %rdx
+	xorl	%r13d, %r13d
+	xorl	%edi, %edi
+	xorl	%esi, %esi
+.LBB162_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit
+	callq	halide_string_to_string@PLT     # each append call returns the destination pointer for the next one
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movq	%r15, %rdx                      # append the caller-supplied string
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.19.131(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movq	%r12, %rdx                      # third argument, printed as an unsigned 64-bit value
+	movl	$1, %ecx                        # fourth parameter 1 (NOTE(review): presumably the minimum digit count)
+	callq	halide_uint64_to_string@PLT
+	leaq	.L.str.20.132(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movq	-48(%rbp), %rdx                 # 8-byte Reload; fourth argument, printed as an unsigned 64-bit value
+	movl	$1, %ecx
+	callq	halide_uint64_to_string@PLT
+	testq	%r14, %r14
+	je	.LBB162_4                       # no buffer: use the fallback message
+# %bb.5:                                # %if.else.i
+	subq	%r14, %rax                      # final write pointer minus buffer start
+	incq	%rax                            # plus one
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	movq	%rax, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT
+	movq	%r14, %rsi                      # the formatted buffer is the message handed to halide_error
+	jmp	.LBB162_6
+.LBB162_4:                              # malloc had failed
+	leaq	.L.str.29.165(%rip), %rsi       # fixed fallback message
+.LBB162_6:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT
+	movq	%r14, %rdi
+	callq	free@PLT                        # r14 is the buffer, or NULL on the malloc-failure path
+	movl	$-5, %eax                       # fixed return value
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end162:
+	.size	halide_error_buffer_allocation_too_large, .Lfunc_end162-halide_error_buffer_allocation_too_large
+                                        # -- End function
+	.section	.text.halide_error_buffer_extents_negative,"ax",@progbits
+	.weak	halide_error_buffer_extents_negative # -- Begin function halide_error_buffer_extents_negative
+	.p2align	4, 0x90
+	.type	halide_error_buffer_extents_negative,@function
+halide_error_buffer_extents_negative:   # @halide_error_buffer_extents_negative -- formats .L.str.21.133 + string in rsi + .L.str.22.134 + integer in edx + .L.str.23.135 + integer in ecx + .L.str.8.120, reports it through halide_error, and returns -28
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	pushq	%rax                            # 8-byte slot at -48(%rbp), used as the spill slot for the first argument
+	movl	%ecx, %r15d                     # r15d = fourth argument (printed second)
+	movl	%edx, %r13d                     # r13d = third argument (printed first)
+	movq	%rsi, %rbx                      # rbx = second argument (string pointer)
+	movq	%rdi, -48(%rbp)                 # 8-byte Spill; first argument, reloaded for halide_error
+	movl	$1024, %edi                     # imm = 0x400; size of the heap buffer used to format the message
+	callq	malloc@PLT
+	movq	%rax, %r14                      # r14 = buffer start (NULL if malloc failed)
+	testq	%rax, %rax
+	je	.LBB163_1
+# %bb.2:                                # %if.then6.i
+	leaq	1023(%r14), %r12                # r12 = address of the last byte, passed as the end pointer to every append call
+	movb	$0, 1023(%r14)                  # store a NUL in the last byte
+	leaq	.L.str.21.133(%rip), %rdx
+	movq	%r14, %rdi
+	movq	%r12, %rsi
+	jmp	.LBB163_3
+.LBB163_1:                              # %entry.split -- malloc failed: the append calls below run with NULL destination and NULL end
+	leaq	.L.str.21.133(%rip), %rdx
+	xorl	%r12d, %r12d
+	xorl	%edi, %edi
+	xorl	%esi, %esi
+.LBB163_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit
+	callq	halide_string_to_string@PLT     # each append call returns the destination pointer for the next one
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movq	%rbx, %rdx                      # append the caller-supplied string
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.22.134(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	callq	halide_string_to_string@PLT
+	movslq	%r13d, %rdx                     # third argument, sign-extended to 64 bits
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movl	$1, %ecx                        # fourth parameter 1 (NOTE(review): presumably the minimum digit count)
+	callq	halide_int64_to_string@PLT
+	leaq	.L.str.23.135(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	callq	halide_string_to_string@PLT
+	movslq	%r15d, %rdx                     # fourth argument, sign-extended to 64 bits
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	leaq	.L.str.8.120(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	callq	halide_string_to_string@PLT
+	testq	%r14, %r14
+	je	.LBB163_4                       # no buffer: use the fallback message
+# %bb.5:                                # %if.else.i
+	subq	%r14, %rax                      # final write pointer minus buffer start
+	incq	%rax                            # plus one
+	movq	-48(%rbp), %rbx                 # 8-byte Reload; first argument
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	movq	%rax, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT
+	movq	%r14, %rsi                      # the formatted buffer is the message handed to halide_error
+	jmp	.LBB163_6
+.LBB163_4:                              # malloc had failed
+	leaq	.L.str.29.165(%rip), %rsi       # fixed fallback message
+	movq	-48(%rbp), %rbx                 # 8-byte Reload; first argument
+.LBB163_6:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT
+	movq	%r14, %rdi
+	callq	free@PLT                        # r14 is the buffer, or NULL on the malloc-failure path
+	movl	$-28, %eax                      # fixed return value
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end163:
+	.size	halide_error_buffer_extents_negative, .Lfunc_end163-halide_error_buffer_extents_negative
+                                        # -- End function
+	.section	.text.halide_error_buffer_extents_too_large,"ax",@progbits # Builds a message from the incoming %rsi string and the %rdx/%rcx integers, passes it to halide_error, and returns -6.
+	.weak	halide_error_buffer_extents_too_large # -- Begin function halide_error_buffer_extents_too_large
+	.p2align	4, 0x90
+	.type	halide_error_buffer_extents_too_large,@function
+halide_error_buffer_extents_too_large:  # @halide_error_buffer_extents_too_large
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	pushq	%rax                            # extra 8-byte slot; it is the -48(%rbp) spill location used below
+	movq	%rcx, -48(%rbp)                 # 8-byte Spill
+	movq	%rdx, %r12                      # r12 = incoming %rdx (first integer printed)
+	movq	%rsi, %r15                      # r15 = incoming %rsi (caller-supplied string)
+	movq	%rdi, %rbx                      # rbx = incoming %rdi, forwarded to halide_error at the end
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT                      # malloc(1024): message buffer
+	movq	%rax, %r14                      # r14 = buffer start, or NULL if malloc failed
+	testq	%rax, %rax
+	je	.LBB164_1                       # no buffer: continue with NULL dst/end pointers
+# %bb.2:                                # %if.then6.i
+	leaq	1023(%r14), %r13                # r13 = address of the last byte of the buffer (end pointer)
+	movb	$0, 1023(%r14)                  # store a zero byte in the last byte of the buffer
+	leaq	.L.str.24.136(%rip), %rdx       # first constant string (contents not visible in this chunk)
+	movq	%r14, %rdi                      # dst = buffer start
+	movq	%r13, %rsi                      # end = last byte
+	jmp	.LBB164_3
+.LBB164_1:                              # %entry.split
+	leaq	.L.str.24.136(%rip), %rdx       # same first string, but with no destination buffer
+	xorl	%r13d, %r13d                    # end pointer kept in r13 = NULL
+	xorl	%edi, %edi                      # dst = NULL
+	xorl	%esi, %esi                      # end = NULL
+.LBB164_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit
+	callq	halide_string_to_string@PLT     # args (dst=%rdi, end=%rsi, str=%rdx); returned %rax is the next dst
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movq	%r15, %rdx                      # caller-supplied string (incoming %rsi)
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.19.131(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movq	%r12, %rdx                      # integer from incoming %rdx
+	movl	$1, %ecx                        # constant 4th argument = 1 (presumably a minimum digit count; confirm against the runtime)
+	callq	halide_int64_to_string@PLT
+	leaq	.L.str.20.132(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movq	-48(%rbp), %rdx                 # 8-byte Reload
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT      # integer from incoming %rcx (spilled at entry)
+	testq	%r14, %r14
+	je	.LBB164_4                       # buffer was never allocated: report the fallback string
+# %bb.5:                                # %if.else.i
+	subq	%r14, %rax                      # rax = final dst - buffer start
+	incq	%rax                            # + 1 (presumably to include the terminator; confirm)
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	movq	%rax, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT # args (incoming %rdi, buffer, length)
+	movq	%r14, %rsi                      # message = the buffer just filled
+	jmp	.LBB164_6
+.LBB164_4:
+	leaq	.L.str.29.165(%rip), %rsi       # message = constant string used when malloc returned NULL
+.LBB164_6:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT                # halide_error(incoming %rdi, message)
+	movq	%r14, %rdi
+	callq	free@PLT                        # free the buffer pointer (NULL on the failure path)
+	movl	$-6, %eax                       # return value: -6
+	addq	$8, %rsp                        # drop the slot created by pushq %rax
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end164:
+	.size	halide_error_buffer_extents_too_large, .Lfunc_end164-halide_error_buffer_extents_too_large
+                                        # -- End function
+	.section	.text.halide_error_constraints_make_required_region_smaller,"ax",@progbits # Builds a message from the %rsi string, the %edx/%ecx/%r8d/%r9d integers and one stack argument, passes it to halide_error, and returns -7.
+	.weak	halide_error_constraints_make_required_region_smaller # -- Begin function halide_error_constraints_make_required_region_smaller
+	.p2align	4, 0x90
+	.type	halide_error_constraints_make_required_region_smaller,@function
+halide_error_constraints_make_required_region_smaller: # @halide_error_constraints_make_required_region_smaller
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	subq	$24, %rsp                       # 24 bytes of locals: the -44, -48, -56 and -64(%rbp) spill slots
+	movl	%r9d, %r13d                     # r13d = incoming %r9d
+                                        # kill: def $r8d killed $r8d def $r8
+                                        # kill: def $ecx killed $ecx def $rcx
+	movl	%edx, %ebx                      # ebx = incoming %edx
+	movq	%rsi, %r12                      # r12 = incoming %rsi (caller-supplied string)
+	movq	%rdi, -56(%rbp)                 # 8-byte Spill
+	movl	16(%rbp), %eax                  # load the stack-passed argument
+	leal	-1(%r13,%rax), %eax             # eax = %r9d value + stack argument - 1
+	movl	%eax, -44(%rbp)                 # 4-byte Spill
+	movq	%rcx, -64(%rbp)                 # 8-byte Spill
+	leal	(%rcx,%r8), %eax                # eax = %ecx value + %r8d value
+	decl	%eax                            # ... - 1
+	movl	%eax, -48(%rbp)                 # 4-byte Spill
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT                      # malloc(1024): message buffer
+	movq	%rax, %r14                      # r14 = buffer start, or NULL if malloc failed
+	testq	%rax, %rax
+	je	.LBB165_1                       # no buffer: continue with NULL dst/end pointers
+# %bb.2:                                # %if.then6.i
+	leaq	1023(%r14), %r15                # r15 = address of the last byte of the buffer (end pointer)
+	movb	$0, 1023(%r14)                  # store a zero byte in the last byte of the buffer
+	leaq	.L.str.25.137(%rip), %rdx       # first constant string (contents not visible in this chunk)
+	movq	%r14, %rdi
+	movq	%r15, %rsi
+	jmp	.LBB165_3
+.LBB165_1:                              # %entry.split
+	leaq	.L.str.25.137(%rip), %rdx
+	xorl	%r15d, %r15d                    # end pointer kept in r15 = NULL
+	xorl	%edi, %edi                      # dst = NULL
+	xorl	%esi, %esi                      # end = NULL
+.LBB165_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit
+	callq	halide_string_to_string@PLT     # args (dst=%rdi, end=%rsi, str=%rdx); returned %rax is the next dst
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movq	%r12, %rdx                      # caller-supplied string (incoming %rsi)
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.26.138(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	callq	halide_string_to_string@PLT
+	movslq	%ebx, %rdx                      # sign-extended incoming %edx
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movl	$1, %ecx                        # constant 4th argument = 1 (presumably a minimum digit count; confirm against the runtime)
+	callq	halide_int64_to_string@PLT
+	leaq	.L.str.27.139(%rip), %rbx       # rbx is reused to hold this string's address; it is emitted twice
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movq	%rbx, %rdx
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.28.140(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	callq	halide_string_to_string@PLT
+	movslq	%r13d, %rdx                     # sign-extended incoming %r9d
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	leaq	.L.str.6.118(%rip), %r12        # r12 is reused to hold this string's address; it is emitted twice
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movq	%r12, %rdx
+	callq	halide_string_to_string@PLT
+	movslq	-44(%rbp), %rdx                 # 4-byte Folded Reload
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT      # prints %r9d value + stack argument - 1 (computed at entry)
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movq	%rbx, %rdx                      # .L.str.27.139 again
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.29.141(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	callq	halide_string_to_string@PLT
+	movslq	-64(%rbp), %rdx                 # 4-byte Folded Reload
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT      # prints the incoming %ecx value
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movq	%r12, %rdx                      # .L.str.6.118 again
+	callq	halide_string_to_string@PLT
+	movslq	-48(%rbp), %rdx                 # 4-byte Folded Reload
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT      # prints %ecx value + %r8d value - 1 (computed at entry)
+	leaq	.L.str.30.142(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	callq	halide_string_to_string@PLT
+	testq	%r14, %r14
+	je	.LBB165_4                       # buffer was never allocated: report the fallback string
+# %bb.5:                                # %if.else.i
+	subq	%r14, %rax                      # rax = final dst - buffer start
+	incq	%rax                            # + 1 (presumably to include the terminator; confirm)
+	movq	-56(%rbp), %rbx                 # 8-byte Reload
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	movq	%rax, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT # args (incoming %rdi, buffer, length)
+	movq	%r14, %rsi                      # message = the buffer just filled
+	jmp	.LBB165_6
+.LBB165_4:
+	leaq	.L.str.29.165(%rip), %rsi       # message = constant string used when malloc returned NULL
+	movq	-56(%rbp), %rbx                 # 8-byte Reload
+.LBB165_6:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT                # halide_error(incoming %rdi, message)
+	movq	%r14, %rdi
+	callq	free@PLT                        # free the buffer pointer (NULL on the failure path)
+	movl	$-7, %eax                       # return value: -7
+	addq	$24, %rsp                       # release the 24 bytes of locals
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end165:
+	.size	halide_error_constraints_make_required_region_smaller, .Lfunc_end165-halide_error_constraints_make_required_region_smaller
+                                        # -- End function
+	.section	.text.halide_error_constraint_violated,"ax",@progbits # Builds a message from the %rsi and %rcx strings and the %edx/%r8d integers, passes it to halide_error, and returns -8.
+	.weak	halide_error_constraint_violated # -- Begin function halide_error_constraint_violated
+	.p2align	4, 0x90
+	.type	halide_error_constraint_violated,@function
+halide_error_constraint_violated:       # @halide_error_constraint_violated
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	subq	$24, %rsp                       # 24 bytes of locals: the -44 and -56(%rbp) spill slots
+	movl	%r8d, -44(%rbp)                 # 4-byte Spill
+	movq	%rcx, %r13                      # r13 = incoming %rcx (second caller-supplied string)
+	movl	%edx, %r15d                     # r15d = incoming %edx (first integer printed)
+	movq	%rsi, %rbx                      # rbx = incoming %rsi (first caller-supplied string)
+	movq	%rdi, -56(%rbp)                 # 8-byte Spill
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT                      # malloc(1024): message buffer
+	movq	%rax, %r14                      # r14 = buffer start, or NULL if malloc failed
+	testq	%rax, %rax
+	je	.LBB166_1                       # no buffer: continue with NULL dst/end pointers
+# %bb.2:                                # %if.then6.i
+	leaq	1023(%r14), %r12                # r12 = address of the last byte of the buffer (end pointer)
+	movb	$0, 1023(%r14)                  # store a zero byte in the last byte of the buffer
+	leaq	.L.str.31.143(%rip), %rdx       # first constant string (contents not visible in this chunk)
+	movq	%r14, %rdi
+	movq	%r12, %rsi
+	jmp	.LBB166_3
+.LBB166_1:                              # %entry.split
+	leaq	.L.str.31.143(%rip), %rdx
+	xorl	%r12d, %r12d                    # end pointer kept in r12 = NULL
+	xorl	%edi, %edi                      # dst = NULL
+	xorl	%esi, %esi                      # end = NULL
+.LBB166_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit
+	callq	halide_string_to_string@PLT     # args (dst=%rdi, end=%rsi, str=%rdx); returned %rax is the next dst
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movq	%rbx, %rdx                      # first caller-supplied string (incoming %rsi)
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.32.144(%rip), %rbx       # rbx is reused to hold this string's address; it is emitted twice
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movq	%rbx, %rdx
+	callq	halide_string_to_string@PLT
+	movslq	%r15d, %rdx                     # sign-extended incoming %edx
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movl	$1, %ecx                        # constant 4th argument = 1 (presumably a minimum digit count; confirm against the runtime)
+	callq	halide_int64_to_string@PLT
+	leaq	.L.str.33.145(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movq	%r13, %rdx                      # second caller-supplied string (incoming %rcx)
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movq	%rbx, %rdx                      # .L.str.32.144 again
+	callq	halide_string_to_string@PLT
+	movslq	-44(%rbp), %rdx                 # 4-byte Folded Reload
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT      # prints the incoming %r8d value (spilled at entry)
+	leaq	.L.str.8.120(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	callq	halide_string_to_string@PLT
+	testq	%r14, %r14
+	je	.LBB166_4                       # buffer was never allocated: report the fallback string
+# %bb.5:                                # %if.else.i
+	subq	%r14, %rax                      # rax = final dst - buffer start
+	incq	%rax                            # + 1 (presumably to include the terminator; confirm)
+	movq	-56(%rbp), %rbx                 # 8-byte Reload
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	movq	%rax, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT # args (incoming %rdi, buffer, length)
+	movq	%r14, %rsi                      # message = the buffer just filled
+	jmp	.LBB166_6
+.LBB166_4:
+	leaq	.L.str.29.165(%rip), %rsi       # message = constant string used when malloc returned NULL
+	movq	-56(%rbp), %rbx                 # 8-byte Reload
+.LBB166_6:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT                # halide_error(incoming %rdi, message)
+	movq	%r14, %rdi
+	callq	free@PLT                        # free the buffer pointer (NULL on the failure path)
+	movl	$-8, %eax                       # return value: -8
+	addq	$24, %rsp                       # release the 24 bytes of locals
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end166:
+	.size	halide_error_constraint_violated, .Lfunc_end166-halide_error_constraint_violated
+                                        # -- End function
+	.section	.text.halide_error_param_too_small_i64,"ax",@progbits # Builds a message from the %rsi string and the signed 64-bit %rdx/%rcx values, passes it to halide_error, and returns -9.
+	.weak	halide_error_param_too_small_i64 # -- Begin function halide_error_param_too_small_i64
+	.p2align	4, 0x90
+	.type	halide_error_param_too_small_i64,@function
+halide_error_param_too_small_i64:       # @halide_error_param_too_small_i64
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	pushq	%rax                            # extra 8-byte slot; it is the -48(%rbp) spill location used below
+	movq	%rcx, -48(%rbp)                 # 8-byte Spill
+	movq	%rdx, %r12                      # r12 = incoming %rdx (first value printed)
+	movq	%rsi, %r15                      # r15 = incoming %rsi (caller-supplied string)
+	movq	%rdi, %rbx                      # rbx = incoming %rdi, forwarded to halide_error at the end
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT                      # malloc(1024): message buffer
+	movq	%rax, %r14                      # r14 = buffer start, or NULL if malloc failed
+	testq	%rax, %rax
+	je	.LBB167_1                       # no buffer: continue with NULL dst/end pointers
+# %bb.2:                                # %if.then6.i
+	leaq	1023(%r14), %r13                # r13 = address of the last byte of the buffer (end pointer)
+	movb	$0, 1023(%r14)                  # store a zero byte in the last byte of the buffer
+	leaq	.L.str.34.146(%rip), %rdx       # first constant string (contents not visible in this chunk)
+	movq	%r14, %rdi
+	movq	%r13, %rsi
+	jmp	.LBB167_3
+.LBB167_1:                              # %entry.split
+	leaq	.L.str.34.146(%rip), %rdx
+	xorl	%r13d, %r13d                    # end pointer kept in r13 = NULL
+	xorl	%edi, %edi                      # dst = NULL
+	xorl	%esi, %esi                      # end = NULL
+.LBB167_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit
+	callq	halide_string_to_string@PLT     # args (dst=%rdi, end=%rsi, str=%rdx); returned %rax is the next dst
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movq	%r15, %rdx                      # caller-supplied string (incoming %rsi)
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.19.131(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movq	%r12, %rdx                      # value from incoming %rdx
+	movl	$1, %ecx                        # constant 4th argument = 1 (presumably a minimum digit count; confirm against the runtime)
+	callq	halide_int64_to_string@PLT
+	leaq	.L.str.35(%rip), %rdx           # separator string; the too_large variants use .L.str.36 here
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movq	-48(%rbp), %rdx                 # 8-byte Reload
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT      # value from incoming %rcx (spilled at entry)
+	testq	%r14, %r14
+	je	.LBB167_4                       # buffer was never allocated: report the fallback string
+# %bb.5:                                # %if.else.i
+	subq	%r14, %rax                      # rax = final dst - buffer start
+	incq	%rax                            # + 1 (presumably to include the terminator; confirm)
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	movq	%rax, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT # args (incoming %rdi, buffer, length)
+	movq	%r14, %rsi                      # message = the buffer just filled
+	jmp	.LBB167_6
+.LBB167_4:
+	leaq	.L.str.29.165(%rip), %rsi       # message = constant string used when malloc returned NULL
+.LBB167_6:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT                # halide_error(incoming %rdi, message)
+	movq	%r14, %rdi
+	callq	free@PLT                        # free the buffer pointer (NULL on the failure path)
+	movl	$-9, %eax                       # return value: -9
+	addq	$8, %rsp                        # drop the slot created by pushq %rax
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end167:
+	.size	halide_error_param_too_small_i64, .Lfunc_end167-halide_error_param_too_small_i64
+                                        # -- End function
+	.section	.text.halide_error_param_too_small_u64,"ax",@progbits # Same flow as the _i64 variant but formats %rdx/%rcx with halide_uint64_to_string; passes the message to halide_error and returns -9.
+	.weak	halide_error_param_too_small_u64 # -- Begin function halide_error_param_too_small_u64
+	.p2align	4, 0x90
+	.type	halide_error_param_too_small_u64,@function
+halide_error_param_too_small_u64:       # @halide_error_param_too_small_u64
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	pushq	%rax                            # extra 8-byte slot; it is the -48(%rbp) spill location used below
+	movq	%rcx, -48(%rbp)                 # 8-byte Spill
+	movq	%rdx, %r12                      # r12 = incoming %rdx (first value printed)
+	movq	%rsi, %r15                      # r15 = incoming %rsi (caller-supplied string)
+	movq	%rdi, %rbx                      # rbx = incoming %rdi, forwarded to halide_error at the end
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT                      # malloc(1024): message buffer
+	movq	%rax, %r14                      # r14 = buffer start, or NULL if malloc failed
+	testq	%rax, %rax
+	je	.LBB168_1                       # no buffer: continue with NULL dst/end pointers
+# %bb.2:                                # %if.then6.i
+	leaq	1023(%r14), %r13                # r13 = address of the last byte of the buffer (end pointer)
+	movb	$0, 1023(%r14)                  # store a zero byte in the last byte of the buffer
+	leaq	.L.str.34.146(%rip), %rdx       # first constant string (contents not visible in this chunk)
+	movq	%r14, %rdi
+	movq	%r13, %rsi
+	jmp	.LBB168_3
+.LBB168_1:                              # %entry.split
+	leaq	.L.str.34.146(%rip), %rdx
+	xorl	%r13d, %r13d                    # end pointer kept in r13 = NULL
+	xorl	%edi, %edi                      # dst = NULL
+	xorl	%esi, %esi                      # end = NULL
+.LBB168_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit
+	callq	halide_string_to_string@PLT     # args (dst=%rdi, end=%rsi, str=%rdx); returned %rax is the next dst
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movq	%r15, %rdx                      # caller-supplied string (incoming %rsi)
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.19.131(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movq	%r12, %rdx                      # value from incoming %rdx
+	movl	$1, %ecx                        # constant 4th argument = 1 (presumably a minimum digit count; confirm against the runtime)
+	callq	halide_uint64_to_string@PLT
+	leaq	.L.str.35(%rip), %rdx           # separator string; the too_large variants use .L.str.36 here
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movq	-48(%rbp), %rdx                 # 8-byte Reload
+	movl	$1, %ecx
+	callq	halide_uint64_to_string@PLT     # value from incoming %rcx (spilled at entry)
+	testq	%r14, %r14
+	je	.LBB168_4                       # buffer was never allocated: report the fallback string
+# %bb.5:                                # %if.else.i
+	subq	%r14, %rax                      # rax = final dst - buffer start
+	incq	%rax                            # + 1 (presumably to include the terminator; confirm)
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	movq	%rax, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT # args (incoming %rdi, buffer, length)
+	movq	%r14, %rsi                      # message = the buffer just filled
+	jmp	.LBB168_6
+.LBB168_4:
+	leaq	.L.str.29.165(%rip), %rsi       # message = constant string used when malloc returned NULL
+.LBB168_6:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT                # halide_error(incoming %rdi, message)
+	movq	%r14, %rdi
+	callq	free@PLT                        # free the buffer pointer (NULL on the failure path)
+	movl	$-9, %eax                       # return value: -9
+	addq	$8, %rsp                        # drop the slot created by pushq %rax
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end168:
+	.size	halide_error_param_too_small_u64, .Lfunc_end168-halide_error_param_too_small_u64
+                                        # -- End function
+	.section	.text.halide_error_param_too_small_f64,"ax",@progbits # Builds a message from the %rsi string and the doubles in %xmm0/%xmm1, passes it to halide_error, and returns -9.
+	.weak	halide_error_param_too_small_f64 # -- Begin function halide_error_param_too_small_f64
+	.p2align	4, 0x90
+	.type	halide_error_param_too_small_f64,@function
+halide_error_param_too_small_f64:       # @halide_error_param_too_small_f64
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r12
+	pushq	%rbx
+	subq	$16, %rsp                       # 16 bytes of locals: the -40 and -48(%rbp) spill slots
+	vmovsd	%xmm1, -48(%rbp)                # 8-byte Spill
+	vmovsd	%xmm0, -40(%rbp)                # 8-byte Spill
+	movq	%rsi, %r12                      # r12 = incoming %rsi (caller-supplied string)
+	movq	%rdi, %rbx                      # rbx = incoming %rdi, forwarded to halide_error at the end
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT                      # malloc(1024): message buffer
+	movq	%rax, %r14                      # r14 = buffer start, or NULL if malloc failed
+	testq	%rax, %rax
+	je	.LBB169_1                       # no buffer: continue with NULL dst/end pointers
+# %bb.2:                                # %if.then6.i
+	leaq	1023(%r14), %r15                # r15 = address of the last byte of the buffer (end pointer)
+	movb	$0, 1023(%r14)                  # store a zero byte in the last byte of the buffer
+	leaq	.L.str.34.146(%rip), %rdx       # first constant string (contents not visible in this chunk)
+	movq	%r14, %rdi
+	movq	%r15, %rsi
+	jmp	.LBB169_3
+.LBB169_1:                              # %entry.split
+	leaq	.L.str.34.146(%rip), %rdx
+	xorl	%r15d, %r15d                    # end pointer kept in r15 = NULL
+	xorl	%edi, %edi                      # dst = NULL
+	xorl	%esi, %esi                      # end = NULL
+.LBB169_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit
+	callq	halide_string_to_string@PLT     # args (dst=%rdi, end=%rsi, str=%rdx); returned %rax is the next dst
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movq	%r12, %rdx                      # caller-supplied string (incoming %rsi)
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.19.131(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	vmovsd	-40(%rbp), %xmm0                # 8-byte Reload
+                                        # xmm0 = mem[0],zero
+	movl	$1, %edx                        # constant integer argument = 1 (presumably a formatting flag; confirm against the runtime)
+	callq	halide_double_to_string@PLT     # formats the incoming %xmm0 value
+	leaq	.L.str.35(%rip), %rdx           # separator string; the too_large variants use .L.str.36 here
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	vmovsd	-48(%rbp), %xmm0                # 8-byte Reload
+                                        # xmm0 = mem[0],zero
+	movl	$1, %edx
+	callq	halide_double_to_string@PLT     # formats the incoming %xmm1 value
+	testq	%r14, %r14
+	je	.LBB169_4                       # buffer was never allocated: report the fallback string
+# %bb.5:                                # %if.else.i
+	subq	%r14, %rax                      # rax = final dst - buffer start
+	incq	%rax                            # + 1 (presumably to include the terminator; confirm)
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	movq	%rax, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT # args (incoming %rdi, buffer, length)
+	movq	%r14, %rsi                      # message = the buffer just filled
+	jmp	.LBB169_6
+.LBB169_4:
+	leaq	.L.str.29.165(%rip), %rsi       # message = constant string used when malloc returned NULL
+.LBB169_6:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT                # halide_error(incoming %rdi, message)
+	movq	%r14, %rdi
+	callq	free@PLT                        # free the buffer pointer (NULL on the failure path)
+	movl	$-9, %eax                       # return value: -9
+	addq	$16, %rsp                       # release the 16 bytes of locals
+	popq	%rbx
+	popq	%r12
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end169:
+	.size	halide_error_param_too_small_f64, .Lfunc_end169-halide_error_param_too_small_f64
+                                        # -- End function
+	.section	.text.halide_error_param_too_large_i64,"ax",@progbits # Builds a message from the %rsi string and the signed 64-bit %rdx/%rcx values, passes it to halide_error, and returns -10.
+	.weak	halide_error_param_too_large_i64 # -- Begin function halide_error_param_too_large_i64
+	.p2align	4, 0x90
+	.type	halide_error_param_too_large_i64,@function
+halide_error_param_too_large_i64:       # @halide_error_param_too_large_i64
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	pushq	%rax                            # extra 8-byte slot; it is the -48(%rbp) spill location used below
+	movq	%rcx, -48(%rbp)                 # 8-byte Spill
+	movq	%rdx, %r12                      # r12 = incoming %rdx (first value printed)
+	movq	%rsi, %r15                      # r15 = incoming %rsi (caller-supplied string)
+	movq	%rdi, %rbx                      # rbx = incoming %rdi, forwarded to halide_error at the end
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT                      # malloc(1024): message buffer
+	movq	%rax, %r14                      # r14 = buffer start, or NULL if malloc failed
+	testq	%rax, %rax
+	je	.LBB170_1                       # no buffer: continue with NULL dst/end pointers
+# %bb.2:                                # %if.then6.i
+	leaq	1023(%r14), %r13                # r13 = address of the last byte of the buffer (end pointer)
+	movb	$0, 1023(%r14)                  # store a zero byte in the last byte of the buffer
+	leaq	.L.str.34.146(%rip), %rdx       # first constant string; the too_small variants start with the same one
+	movq	%r14, %rdi
+	movq	%r13, %rsi
+	jmp	.LBB170_3
+.LBB170_1:                              # %entry.split
+	leaq	.L.str.34.146(%rip), %rdx
+	xorl	%r13d, %r13d                    # end pointer kept in r13 = NULL
+	xorl	%edi, %edi                      # dst = NULL
+	xorl	%esi, %esi                      # end = NULL
+.LBB170_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit
+	callq	halide_string_to_string@PLT     # args (dst=%rdi, end=%rsi, str=%rdx); returned %rax is the next dst
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movq	%r15, %rdx                      # caller-supplied string (incoming %rsi)
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.19.131(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movq	%r12, %rdx                      # value from incoming %rdx
+	movl	$1, %ecx                        # constant 4th argument = 1 (presumably a minimum digit count; confirm against the runtime)
+	callq	halide_int64_to_string@PLT
+	leaq	.L.str.36(%rip), %rdx           # separator string; the too_small variants use .L.str.35 here
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movq	-48(%rbp), %rdx                 # 8-byte Reload
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT      # value from incoming %rcx (spilled at entry)
+	testq	%r14, %r14
+	je	.LBB170_4                       # buffer was never allocated: report the fallback string
+# %bb.5:                                # %if.else.i
+	subq	%r14, %rax                      # rax = final dst - buffer start
+	incq	%rax                            # + 1 (presumably to include the terminator; confirm)
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	movq	%rax, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT # args (incoming %rdi, buffer, length)
+	movq	%r14, %rsi                      # message = the buffer just filled
+	jmp	.LBB170_6
+.LBB170_4:
+	leaq	.L.str.29.165(%rip), %rsi       # message = constant string used when malloc returned NULL
+.LBB170_6:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT                # halide_error(incoming %rdi, message)
+	movq	%r14, %rdi
+	callq	free@PLT                        # free the buffer pointer (NULL on the failure path)
+	movl	$-10, %eax                      # return value: -10
+	addq	$8, %rsp                        # drop the slot created by pushq %rax
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end170:
+	.size	halide_error_param_too_large_i64, .Lfunc_end170-halide_error_param_too_large_i64
+                                        # -- End function
+	.section	.text.halide_error_param_too_large_u64,"ax",@progbits # Same flow as the _i64 variant but formats %rdx/%rcx with halide_uint64_to_string; passes the message to halide_error and returns -10.
+	.weak	halide_error_param_too_large_u64 # -- Begin function halide_error_param_too_large_u64
+	.p2align	4, 0x90
+	.type	halide_error_param_too_large_u64,@function
+halide_error_param_too_large_u64:       # @halide_error_param_too_large_u64
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	pushq	%rax                            # extra 8-byte slot; it is the -48(%rbp) spill location used below
+	movq	%rcx, -48(%rbp)                 # 8-byte Spill
+	movq	%rdx, %r12                      # r12 = incoming %rdx (first value printed)
+	movq	%rsi, %r15                      # r15 = incoming %rsi (caller-supplied string)
+	movq	%rdi, %rbx                      # rbx = incoming %rdi, forwarded to halide_error at the end
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT                      # malloc(1024): message buffer
+	movq	%rax, %r14                      # r14 = buffer start, or NULL if malloc failed
+	testq	%rax, %rax
+	je	.LBB171_1                       # no buffer: continue with NULL dst/end pointers
+# %bb.2:                                # %if.then6.i
+	leaq	1023(%r14), %r13                # r13 = address of the last byte of the buffer (end pointer)
+	movb	$0, 1023(%r14)                  # store a zero byte in the last byte of the buffer
+	leaq	.L.str.34.146(%rip), %rdx       # first constant string; the too_small variants start with the same one
+	movq	%r14, %rdi
+	movq	%r13, %rsi
+	jmp	.LBB171_3
+.LBB171_1:                              # %entry.split
+	leaq	.L.str.34.146(%rip), %rdx
+	xorl	%r13d, %r13d                    # end pointer kept in r13 = NULL
+	xorl	%edi, %edi                      # dst = NULL
+	xorl	%esi, %esi                      # end = NULL
+.LBB171_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit
+	callq	halide_string_to_string@PLT     # args (dst=%rdi, end=%rsi, str=%rdx); returned %rax is the next dst
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movq	%r15, %rdx                      # caller-supplied string (incoming %rsi)
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.19.131(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movq	%r12, %rdx                      # value from incoming %rdx
+	movl	$1, %ecx                        # constant 4th argument = 1 (presumably a minimum digit count; confirm against the runtime)
+	callq	halide_uint64_to_string@PLT
+	leaq	.L.str.36(%rip), %rdx           # separator string; the too_small variants use .L.str.35 here
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movq	-48(%rbp), %rdx                 # 8-byte Reload
+	movl	$1, %ecx
+	callq	halide_uint64_to_string@PLT     # value from incoming %rcx (spilled at entry)
+	testq	%r14, %r14
+	je	.LBB171_4                       # buffer was never allocated: report the fallback string
+# %bb.5:                                # %if.else.i
+	subq	%r14, %rax                      # rax = final dst - buffer start
+	incq	%rax                            # + 1 (presumably to include the terminator; confirm)
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	movq	%rax, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT # args (incoming %rdi, buffer, length)
+	movq	%r14, %rsi                      # message = the buffer just filled
+	jmp	.LBB171_6
+.LBB171_4:
+	leaq	.L.str.29.165(%rip), %rsi       # message = constant string used when malloc returned NULL
+.LBB171_6:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT                # halide_error(incoming %rdi, message)
+	movq	%r14, %rdi
+	callq	free@PLT                        # free the buffer pointer (NULL on the failure path)
+	movl	$-10, %eax                      # return value: -10
+	addq	$8, %rsp                        # drop the slot created by pushq %rax
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end171:
+	.size	halide_error_param_too_large_u64, .Lfunc_end171-halide_error_param_too_large_u64
+                                        # -- End function
+	.section	.text.halide_error_param_too_large_f64,"ax",@progbits # Builds a message from the %rsi string and the doubles in %xmm0/%xmm1, passes it to halide_error, and returns -10.
+	.weak	halide_error_param_too_large_f64 # -- Begin function halide_error_param_too_large_f64
+	.p2align	4, 0x90
+	.type	halide_error_param_too_large_f64,@function
+halide_error_param_too_large_f64:       # @halide_error_param_too_large_f64
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r12
+	pushq	%rbx
+	subq	$16, %rsp                       # 16 bytes of locals: the -40 and -48(%rbp) spill slots
+	vmovsd	%xmm1, -48(%rbp)                # 8-byte Spill
+	vmovsd	%xmm0, -40(%rbp)                # 8-byte Spill
+	movq	%rsi, %r12                      # r12 = incoming %rsi (caller-supplied string)
+	movq	%rdi, %rbx                      # rbx = incoming %rdi, forwarded to halide_error at the end
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT                      # malloc(1024): message buffer
+	movq	%rax, %r14                      # r14 = buffer start, or NULL if malloc failed
+	testq	%rax, %rax
+	je	.LBB172_1                       # no buffer: continue with NULL dst/end pointers
+# %bb.2:                                # %if.then6.i
+	leaq	1023(%r14), %r15                # r15 = address of the last byte of the buffer (end pointer)
+	movb	$0, 1023(%r14)                  # store a zero byte in the last byte of the buffer
+	leaq	.L.str.34.146(%rip), %rdx       # first constant string; the too_small variants start with the same one
+	movq	%r14, %rdi
+	movq	%r15, %rsi
+	jmp	.LBB172_3
+.LBB172_1:                              # %entry.split
+	leaq	.L.str.34.146(%rip), %rdx
+	xorl	%r15d, %r15d                    # end pointer kept in r15 = NULL
+	xorl	%edi, %edi                      # dst = NULL
+	xorl	%esi, %esi                      # end = NULL
+.LBB172_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit
+	callq	halide_string_to_string@PLT     # args (dst=%rdi, end=%rsi, str=%rdx); returned %rax is the next dst
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movq	%r12, %rdx                      # caller-supplied string (incoming %rsi)
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.19.131(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	vmovsd	-40(%rbp), %xmm0                # 8-byte Reload
+                                        # xmm0 = mem[0],zero
+	movl	$1, %edx                        # constant integer argument = 1 (presumably a formatting flag; confirm against the runtime)
+	callq	halide_double_to_string@PLT     # formats the incoming %xmm0 value
+	leaq	.L.str.36(%rip), %rdx           # separator string; the too_small variants use .L.str.35 here
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	vmovsd	-48(%rbp), %xmm0                # 8-byte Reload
+                                        # xmm0 = mem[0],zero
+	movl	$1, %edx
+	callq	halide_double_to_string@PLT     # formats the incoming %xmm1 value
+	testq	%r14, %r14
+	je	.LBB172_4                       # buffer was never allocated: report the fallback string
+# %bb.5:                                # %if.else.i
+	subq	%r14, %rax                      # rax = final dst - buffer start
+	incq	%rax                            # + 1 (presumably to include the terminator; confirm)
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	movq	%rax, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT # args (incoming %rdi, buffer, length)
+	movq	%r14, %rsi                      # message = the buffer just filled
+	jmp	.LBB172_6
+.LBB172_4:
+	leaq	.L.str.29.165(%rip), %rsi       # message = constant string used when malloc returned NULL
+.LBB172_6:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT                # halide_error(incoming %rdi, message)
+	movq	%r14, %rdi
+	callq	free@PLT                        # free the buffer pointer (NULL on the failure path)
+	movl	$-10, %eax                      # return value: -10
+	addq	$16, %rsp                       # release the 16 bytes of locals
+	popq	%rbx
+	popq	%r12
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end172:
+	.size	halide_error_param_too_large_f64, .Lfunc_end172-halide_error_param_too_large_f64
+                                        # -- End function
+	.section	.text.halide_error_out_of_memory,"ax",@progbits # Reports the fixed string .L.str.37 through halide_error and returns -11; unlike its neighbours it allocates no buffer.
+	.weak	halide_error_out_of_memory      # -- Begin function halide_error_out_of_memory
+	.p2align	4, 0x90
+	.type	halide_error_out_of_memory,@function
+halide_error_out_of_memory:             # @halide_error_out_of_memory
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	leaq	.L.str.37(%rip), %rsi           # message = constant string (contents not visible in this chunk)
+	callq	halide_error@PLT                # incoming %rdi is passed through untouched as the first argument
+	movl	$-11, %eax                      # return value: -11
+	popq	%rbp
+	retq
+.Lfunc_end173:
+	.size	halide_error_out_of_memory, .Lfunc_end173-halide_error_out_of_memory
+                                        # -- End function
+	.section	.text.halide_error_buffer_argument_is_null,"ax",@progbits # Builds a message from the incoming %rsi string between two constant strings, passes it to halide_error, and returns -12.
+	.weak	halide_error_buffer_argument_is_null # -- Begin function halide_error_buffer_argument_is_null
+	.p2align	4, 0x90
+	.type	halide_error_buffer_argument_is_null,@function
+halide_error_buffer_argument_is_null:   # @halide_error_buffer_argument_is_null
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r12
+	pushq	%rbx
+	movq	%rsi, %r15                      # r15 = incoming %rsi (caller-supplied string)
+	movq	%rdi, %rbx                      # rbx = incoming %rdi, forwarded to halide_error at the end
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT                      # malloc(1024): message buffer
+	movq	%rax, %r14                      # r14 = buffer start, or NULL if malloc failed
+	testq	%rax, %rax
+	je	.LBB174_1                       # no buffer: continue with NULL dst/end pointers
+# %bb.2:                                # %if.then6.i
+	leaq	1023(%r14), %r12                # r12 = address of the last byte of the buffer (end pointer)
+	movb	$0, 1023(%r14)                  # store a zero byte in the last byte of the buffer
+	leaq	.L.str.38(%rip), %rdx           # first constant string (contents not visible in this chunk)
+	movq	%r14, %rdi
+	movq	%r12, %rsi
+	jmp	.LBB174_3
+.LBB174_1:                              # %entry.split
+	leaq	.L.str.38(%rip), %rdx
+	xorl	%r12d, %r12d                    # end pointer kept in r12 = NULL
+	xorl	%edi, %edi                      # dst = NULL
+	xorl	%esi, %esi                      # end = NULL
+.LBB174_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit
+	callq	halide_string_to_string@PLT     # args (dst=%rdi, end=%rsi, str=%rdx); returned %rax is the next dst
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movq	%r15, %rdx                      # caller-supplied string (incoming %rsi)
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.39(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	callq	halide_string_to_string@PLT
+	testq	%r14, %r14
+	je	.LBB174_4                       # buffer was never allocated: report the fallback string
+# %bb.5:                                # %if.else.i
+	subq	%r14, %rax                      # rax = final dst - buffer start
+	incq	%rax                            # + 1 (presumably to include the terminator; confirm)
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	movq	%rax, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT # args (incoming %rdi, buffer, length)
+	movq	%r14, %rsi                      # message = the buffer just filled
+	jmp	.LBB174_6
+.LBB174_4:
+	leaq	.L.str.29.165(%rip), %rsi       # message = constant string used when malloc returned NULL
+.LBB174_6:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT                # halide_error(incoming %rdi, message)
+	movq	%r14, %rdi
+	callq	free@PLT                        # free the buffer pointer (NULL on the failure path)
+	movl	$-12, %eax                      # return value: -12
+	popq	%rbx
+	popq	%r12
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end174:
+	.size	halide_error_buffer_argument_is_null, .Lfunc_end174-halide_error_buffer_argument_is_null
+                                        # -- End function
+	.section	.text.halide_error_debug_to_file_failed,"ax",@progbits # Builds a message from the %rsi and %rdx strings and the %ecx integer, passes it to halide_error, and returns -13.
+	.weak	halide_error_debug_to_file_failed # -- Begin function halide_error_debug_to_file_failed
+	.p2align	4, 0x90
+	.type	halide_error_debug_to_file_failed,@function
+halide_error_debug_to_file_failed:      # @halide_error_debug_to_file_failed
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	pushq	%rax                            # extra 8-byte slot; the 4-byte spill at -44(%rbp) lives inside it
+	movl	%ecx, -44(%rbp)                 # 4-byte Spill
+	movq	%rdx, %r12                      # r12 = incoming %rdx (second caller-supplied string)
+	movq	%rsi, %r15                      # r15 = incoming %rsi (first caller-supplied string)
+	movq	%rdi, %rbx                      # rbx = incoming %rdi, forwarded to halide_error at the end
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT                      # malloc(1024): message buffer
+	movq	%rax, %r14                      # r14 = buffer start, or NULL if malloc failed
+	testq	%rax, %rax
+	je	.LBB175_1                       # no buffer: continue with NULL dst/end pointers
+# %bb.2:                                # %if.then6.i
+	leaq	1023(%r14), %r13                # r13 = address of the last byte of the buffer (end pointer)
+	movb	$0, 1023(%r14)                  # store a zero byte in the last byte of the buffer
+	leaq	.L.str.40.147(%rip), %rdx       # first constant string (contents not visible in this chunk)
+	movq	%r14, %rdi
+	movq	%r13, %rsi
+	jmp	.LBB175_3
+.LBB175_1:                              # %entry.split
+	leaq	.L.str.40.147(%rip), %rdx
+	xorl	%r13d, %r13d                    # end pointer kept in r13 = NULL
+	xorl	%edi, %edi                      # dst = NULL
+	xorl	%esi, %esi                      # end = NULL
+.LBB175_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit
+	callq	halide_string_to_string@PLT     # args (dst=%rdi, end=%rsi, str=%rdx); returned %rax is the next dst
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movq	%r15, %rdx                      # first caller-supplied string (incoming %rsi)
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.41(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movq	%r12, %rdx                      # second caller-supplied string (incoming %rdx)
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.42(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	movslq	-44(%rbp), %rdx                 # 4-byte Folded Reload
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movl	$1, %ecx                        # constant 4th argument = 1 (presumably a minimum digit count; confirm against the runtime)
+	callq	halide_int64_to_string@PLT      # prints the sign-extended incoming %ecx value
+	testq	%r14, %r14
+	je	.LBB175_4                       # buffer was never allocated: report the fallback string
+# %bb.5:                                # %if.else.i
+	subq	%r14, %rax                      # rax = final dst - buffer start
+	incq	%rax                            # + 1 (presumably to include the terminator; confirm)
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	movq	%rax, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT # args (incoming %rdi, buffer, length)
+	movq	%r14, %rsi                      # message = the buffer just filled
+	jmp	.LBB175_6
+.LBB175_4:
+	leaq	.L.str.29.165(%rip), %rsi       # message = constant string used when malloc returned NULL
+.LBB175_6:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT                # halide_error(incoming %rdi, message)
+	movq	%r14, %rdi
+	callq	free@PLT                        # free the buffer pointer (NULL on the failure path)
+	movl	$-13, %eax                      # return value: -13
+	addq	$8, %rsp                        # drop the slot created by pushq %rax
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end175:
+	.size	halide_error_debug_to_file_failed, .Lfunc_end175-halide_error_debug_to_file_failed
+                                        # -- End function
+	.section	.text.halide_error_unaligned_host_ptr,"ax",@progbits # Builds a message from the incoming %rsi string and the %edx integer, passes it to halide_error, and returns -24.
+	.weak	halide_error_unaligned_host_ptr # -- Begin function halide_error_unaligned_host_ptr
+	.p2align	4, 0x90
+	.type	halide_error_unaligned_host_ptr,@function
+halide_error_unaligned_host_ptr:        # @halide_error_unaligned_host_ptr
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	pushq	%rax                            # extra 8-byte slot, never read or written here (presumably stack-alignment padding; confirm)
+	movl	%edx, %r15d                     # r15d = incoming %edx (integer printed)
+	movq	%rsi, %r13                      # r13 = incoming %rsi (caller-supplied string)
+	movq	%rdi, %rbx                      # rbx = incoming %rdi, forwarded to halide_error at the end
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT                      # malloc(1024): message buffer
+	movq	%rax, %r14                      # r14 = buffer start, or NULL if malloc failed
+	testq	%rax, %rax
+	je	.LBB176_1                       # no buffer: continue with NULL dst/end pointers
+# %bb.2:                                # %if.then6.i
+	leaq	1023(%r14), %r12                # r12 = address of the last byte of the buffer (end pointer)
+	movb	$0, 1023(%r14)                  # store a zero byte in the last byte of the buffer
+	leaq	.L.str.43(%rip), %rdx           # first constant string (contents not visible in this chunk)
+	movq	%r14, %rdi
+	movq	%r12, %rsi
+	jmp	.LBB176_3
+.LBB176_1:                              # %entry.split
+	leaq	.L.str.43(%rip), %rdx
+	xorl	%r12d, %r12d                    # end pointer kept in r12 = NULL
+	xorl	%edi, %edi                      # dst = NULL
+	xorl	%esi, %esi                      # end = NULL
+.LBB176_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit
+	callq	halide_string_to_string@PLT     # args (dst=%rdi, end=%rsi, str=%rdx); returned %rax is the next dst
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movq	%r13, %rdx                      # caller-supplied string (incoming %rsi)
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.44(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	callq	halide_string_to_string@PLT
+	movslq	%r15d, %rdx                     # sign-extended incoming %edx
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movl	$1, %ecx                        # constant 4th argument = 1 (presumably a minimum digit count; confirm against the runtime)
+	callq	halide_int64_to_string@PLT
+	leaq	.L.str.45(%rip), %rdx
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	callq	halide_string_to_string@PLT
+	testq	%r14, %r14
+	je	.LBB176_4                       # buffer was never allocated: report the fallback string
+# %bb.5:                                # %if.else.i
+	subq	%r14, %rax                      # rax = final dst - buffer start
+	incq	%rax                            # + 1 (presumably to include the terminator; confirm)
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	movq	%rax, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT # args (incoming %rdi, buffer, length)
+	movq	%r14, %rsi                      # message = the buffer just filled
+	jmp	.LBB176_6
+.LBB176_4:
+	leaq	.L.str.29.165(%rip), %rsi       # message = constant string used when malloc returned NULL
+.LBB176_6:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT                # halide_error(incoming %rdi, message)
+	movq	%r14, %rdi
+	callq	free@PLT                        # free the buffer pointer (NULL on the failure path)
+	movl	$-24, %eax                      # return value: -24
+	addq	$8, %rsp                        # drop the slot created by pushq %rax
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end176:
+	.size	halide_error_unaligned_host_ptr, .Lfunc_end176-halide_error_unaligned_host_ptr
+                                        # -- End function
+	.section	.text.halide_error_device_dirty_with_no_device_support,"ax",@progbits
+	.weak	halide_error_device_dirty_with_no_device_support # -- Begin function halide_error_device_dirty_with_no_device_support
+	.p2align	4, 0x90
+	.type	halide_error_device_dirty_with_no_device_support,@function
+halide_error_device_dirty_with_no_device_support: # @halide_error_device_dirty_with_no_device_support; uses rdi, rsi (string); builds a message, calls halide_error, returns -44
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r12
+	pushq	%rbx
+	movq	%rsi, %r15                      # r15 = 2nd argument (string printed later)
+	movq	%rdi, %rbx                      # rbx = 1st argument (forwarded to halide_error)
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT                      # request a 1024-byte message buffer
+	movq	%rax, %r14                      # r14 = buffer start, 0 if malloc failed
+	testq	%rax, %rax
+	je	.LBB177_1                       # no buffer: use the all-zero pointer setup
+# %bb.2:                                # %if.then6.i
+	leaq	1023(%r14), %r12                # r12 = address of the last byte of the buffer
+	movb	$0, 1023(%r14)                  # store a zero byte in the last position
+	leaq	.L.str.46(%rip), %rdx           # first piece of the message: .L.str.46
+	movq	%r14, %rdi
+	movq	%r12, %rsi
+	jmp	.LBB177_3
+.LBB177_1:                              # %entry.split
+	leaq	.L.str.46(%rip), %rdx
+	xorl	%r12d, %r12d                    # end pointer = 0
+	xorl	%edi, %edi                      # destination = 0
+	xorl	%esi, %esi
+.LBB177_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit
+	callq	halide_string_to_string@PLT     # args rdi, rsi, rdx look like (dst, end, src); returned rax is the next dst
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movq	%r15, %rdx                      # next piece: the 2nd-argument string
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.47(%rip), %rdx           # next piece: .L.str.47
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.48(%rip), %rdx           # last piece: .L.str.48
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	callq	halide_string_to_string@PLT
+	testq	%r14, %r14                      # was a buffer allocated?
+	je	.LBB177_4
+# %bb.5:                                # %if.else.i
+	subq	%r14, %rax                      # rax = final dst pointer minus buffer start
+	incq	%rax                            # plus one (presumably covers the terminating zero byte)
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	movq	%rax, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT # called with (1st argument, buffer, length)
+	movq	%r14, %rsi                      # message = the formatted buffer
+	jmp	.LBB177_6
+.LBB177_4:
+	leaq	.L.str.29.165(%rip), %rsi       # no buffer: message = fixed string .L.str.29.165
+.LBB177_6:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT                # halide_error(1st argument, message)
+	movq	%r14, %rdi
+	callq	free@PLT                        # free the buffer (r14 is 0 when malloc failed)
+	movl	$-44, %eax                      # return value is always -44
+	popq	%rbx
+	popq	%r12
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end177:
+	.size	halide_error_device_dirty_with_no_device_support, .Lfunc_end177-halide_error_device_dirty_with_no_device_support
+                                        # -- End function
+	.section	.text.halide_error_host_is_null,"ax",@progbits
+	.weak	halide_error_host_is_null       # -- Begin function halide_error_host_is_null
+	.p2align	4, 0x90
+	.type	halide_error_host_is_null,@function
+halide_error_host_is_null:              # @halide_error_host_is_null; uses rdi, rsi (string); builds a message, calls halide_error, returns -34
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r12
+	pushq	%rbx
+	movq	%rsi, %r15                      # r15 = 2nd argument (string printed later)
+	movq	%rdi, %rbx                      # rbx = 1st argument (forwarded to halide_error)
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT                      # request a 1024-byte message buffer
+	movq	%rax, %r14                      # r14 = buffer start, 0 if malloc failed
+	testq	%rax, %rax
+	je	.LBB178_1                       # no buffer: use the all-zero pointer setup
+# %bb.2:                                # %if.then6.i
+	leaq	1023(%r14), %r12                # r12 = address of the last byte of the buffer
+	movb	$0, 1023(%r14)                  # store a zero byte in the last position
+	leaq	.L.str.43(%rip), %rdx           # first piece of the message: .L.str.43
+	movq	%r14, %rdi
+	movq	%r12, %rsi
+	jmp	.LBB178_3
+.LBB178_1:                              # %entry.split
+	leaq	.L.str.43(%rip), %rdx
+	xorl	%r12d, %r12d                    # end pointer = 0
+	xorl	%edi, %edi                      # destination = 0
+	xorl	%esi, %esi
+.LBB178_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit
+	callq	halide_string_to_string@PLT     # args rdi, rsi, rdx look like (dst, end, src); returned rax is the next dst
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movq	%r15, %rdx                      # next piece: the 2nd-argument string
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.49(%rip), %rdx           # last piece: .L.str.49
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	callq	halide_string_to_string@PLT
+	testq	%r14, %r14                      # was a buffer allocated?
+	je	.LBB178_4
+# %bb.5:                                # %if.else.i
+	subq	%r14, %rax                      # rax = final dst pointer minus buffer start
+	incq	%rax                            # plus one (presumably covers the terminating zero byte)
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	movq	%rax, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT # called with (1st argument, buffer, length)
+	movq	%r14, %rsi                      # message = the formatted buffer
+	jmp	.LBB178_6
+.LBB178_4:
+	leaq	.L.str.29.165(%rip), %rsi       # no buffer: message = fixed string .L.str.29.165
+.LBB178_6:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT                # halide_error(1st argument, message)
+	movq	%r14, %rdi
+	callq	free@PLT                        # free the buffer (r14 is 0 when malloc failed)
+	movl	$-34, %eax                      # return value is always -34
+	popq	%rbx
+	popq	%r12
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end178:
+	.size	halide_error_host_is_null, .Lfunc_end178-halide_error_host_is_null
+                                        # -- End function
+	.section	.text.halide_error_bad_fold,"ax",@progbits
+	.weak	halide_error_bad_fold           # -- Begin function halide_error_bad_fold
+	.p2align	4, 0x90
+	.type	halide_error_bad_fold,@function
+halide_error_bad_fold:                  # @halide_error_bad_fold; uses rdi and three strings in rsi, rdx, rcx; builds a message, calls halide_error, returns -25
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	pushq	%rax                            # reserves the 8-byte slot at -48(%rbp) used as a spill below
+	movq	%rcx, -48(%rbp)                 # 8-byte Spill
+	movq	%rdx, %r15                      # r15 = 3rd argument (string)
+	movq	%rsi, %r13                      # r13 = 2nd argument (string)
+	movq	%rdi, %rbx                      # rbx = 1st argument (forwarded to halide_error)
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT                      # request a 1024-byte message buffer
+	movq	%rax, %r14                      # r14 = buffer start, 0 if malloc failed
+	testq	%rax, %rax
+	je	.LBB179_1                       # no buffer: use the all-zero pointer setup
+# %bb.2:                                # %if.then6.i
+	leaq	1023(%r14), %r12                # r12 = address of the last byte of the buffer
+	movb	$0, 1023(%r14)                  # store a zero byte in the last position
+	leaq	.L.str.50.148(%rip), %rdx       # first piece of the message: .L.str.50.148
+	movq	%r14, %rdi
+	movq	%r12, %rsi
+	jmp	.LBB179_3
+.LBB179_1:                              # %entry.split
+	leaq	.L.str.50.148(%rip), %rdx
+	xorl	%r12d, %r12d                    # end pointer = 0
+	xorl	%edi, %edi                      # destination = 0
+	xorl	%esi, %esi
+.LBB179_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit
+	callq	halide_string_to_string@PLT     # args rdi, rsi, rdx look like (dst, end, src); returned rax is the next dst
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movq	%r15, %rdx                      # next piece: the 3rd-argument string
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.51(%rip), %rdx           # next piece: .L.str.51
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movq	%r13, %rdx                      # next piece: the 2nd-argument string
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.52(%rip), %rdx           # next piece: .L.str.52
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movq	-48(%rbp), %rdx                 # 8-byte Reload
+	callq	halide_string_to_string@PLT     # prints the 4th-argument string reloaded above
+	leaq	.L.str.30.142(%rip), %rdx       # last piece: .L.str.30.142
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	callq	halide_string_to_string@PLT
+	testq	%r14, %r14                      # was a buffer allocated?
+	je	.LBB179_4
+# %bb.5:                                # %if.else.i
+	subq	%r14, %rax                      # rax = final dst pointer minus buffer start
+	incq	%rax                            # plus one (presumably covers the terminating zero byte)
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	movq	%rax, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT # called with (1st argument, buffer, length)
+	movq	%r14, %rsi                      # message = the formatted buffer
+	jmp	.LBB179_6
+.LBB179_4:
+	leaq	.L.str.29.165(%rip), %rsi       # no buffer: message = fixed string .L.str.29.165
+.LBB179_6:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT                # halide_error(1st argument, message)
+	movq	%r14, %rdi
+	callq	free@PLT                        # free the buffer (r14 is 0 when malloc failed)
+	movl	$-25, %eax                      # return value is always -25
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end179:
+	.size	halide_error_bad_fold, .Lfunc_end179-halide_error_bad_fold
+                                        # -- End function
+	.section	.text.halide_error_bad_extern_fold,"ax",@progbits
+	.weak	halide_error_bad_extern_fold    # -- Begin function halide_error_bad_extern_fold
+	.p2align	4, 0x90
+	.type	halide_error_bad_extern_fold,@function
+halide_error_bad_extern_fold:           # @halide_error_bad_extern_fold; uses rdi, rsi (string), edx, ecx, r8d, r9d and one int at 16(%rbp); picks one of two messages, calls halide_error, returns -35
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	subq	$24, %rsp                       # room for the spill slots at -48, -56 and -64(%rbp)
+	movl	%r9d, %r12d                     # r12d = 6th argument
+	movl	%r8d, %ebx                      # ebx = 5th argument
+	movl	%ecx, %r15d                     # r15d = 4th argument
+	movq	%rsi, %r13                      # r13 = 2nd argument (string)
+	movq	%rdi, -48(%rbp)                 # 8-byte Spill
+	movl	16(%rbp), %eax                  # eax = 7th argument, passed on the stack
+	cmpl	%r9d, %ecx
+	jl	.LBB180_2                       # 4th argument below 6th argument: first message variant
+# %bb.1:                                # %lor.lhs.false
+	leal	(%rbx,%r15), %ecx               # ecx = 5th argument + 4th argument
+	addl	%r12d, %eax                     # eax = 7th argument + 6th argument
+	cmpl	%eax, %ecx
+	jle	.LBB180_8                       # first sum not above second sum: second message variant
+.LBB180_2:                              # %if.then
+	movq	%r15, -64(%rbp)                 # 8-byte Spill
+	movl	%edx, %r15d                     # r15d = 3rd argument
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT                      # request a 1024-byte message buffer
+	movq	%rax, %r14                      # r14 = buffer start, 0 if malloc failed
+	testq	%rax, %rax
+	movq	%rbx, -56(%rbp)                 # 8-byte Spill
+	je	.LBB180_3                       # branches on the testq above; the movq in between leaves flags alone
+# %bb.4:                                # %if.then6.i
+	leaq	1023(%r14), %rbx                # rbx = address of the last byte of the buffer
+	movb	$0, 1023(%r14)                  # store a zero byte in the last position
+	leaq	.L.str.53.149(%rip), %rdx       # first piece of the message: .L.str.53.149
+	movq	%r14, %rdi
+	movq	%rbx, %rsi
+	jmp	.LBB180_5
+.LBB180_3:                              # %if.then.split
+	leaq	.L.str.53.149(%rip), %rdx
+	xorl	%ebx, %ebx                      # end pointer = 0
+	xorl	%edi, %edi                      # destination = 0
+	xorl	%esi, %esi
+.LBB180_5:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit
+	callq	halide_string_to_string@PLT     # args rdi, rsi, rdx look like (dst, end, src); returned rax is the next dst
+	movq	%rax, %rdi
+	movslq	%r15d, %rdx                     # print the 3rd argument as a signed 64-bit value
+	movq	%rbx, %rsi
+	movl	$1, %ecx                        # 4th argument of halide_int64_to_string is the constant 1
+	callq	halide_int64_to_string@PLT
+	leaq	.L.str.51(%rip), %rdx           # next piece: .L.str.51
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	movq	%r13, %rdx                      # next piece: the 2nd-argument string
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.54(%rip), %rdx           # next piece: .L.str.54
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	callq	halide_string_to_string@PLT
+	movq	-64(%rbp), %r15                 # 8-byte Reload
+	movslq	%r15d, %rdx                     # print the original 4th argument
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	leaq	.L.str.55(%rip), %r13           # r13 = .L.str.55, kept because it is printed twice
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	movq	%r13, %rdx
+	callq	halide_string_to_string@PLT
+	movq	-56(%rbp), %rcx                 # 8-byte Reload
+	addl	%r15d, %ecx                     # 5th argument + 4th argument
+	decl	%ecx                            # minus one
+	movslq	%ecx, %rdx
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	leaq	.L.str.56(%rip), %rdx           # next piece: .L.str.56
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.57(%rip), %rdx           # next piece: .L.str.57
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	callq	halide_string_to_string@PLT
+	movslq	%r12d, %rdx                     # print the 6th argument
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	movq	%r13, %rdx                      # .L.str.55 again
+	callq	halide_string_to_string@PLT
+	movl	16(%rbp), %ecx                  # 7th argument
+	addl	%r12d, %ecx                     # plus 6th argument
+	decl	%ecx                            # minus one
+	movslq	%ecx, %rdx
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	leaq	.L.str.58.150(%rip), %rdx       # closing piece for this variant, printed at .LBB180_6
+	jmp	.LBB180_6
+.LBB180_8:                              # %if.else
+	movl	%ecx, -56(%rbp)                 # 4-byte Spill
+	movl	%edx, %r12d                     # r12d = 3rd argument (6th argument is not used on this path)
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT                      # request a 1024-byte message buffer
+	movq	%rax, %r14                      # r14 = buffer start, 0 if malloc failed
+	testq	%rax, %rax
+	je	.LBB180_9
+# %bb.10:                               # %if.then6.i106
+	leaq	1023(%r14), %rbx                # rbx = address of the last byte of the buffer
+	movb	$0, 1023(%r14)                  # store a zero byte in the last position
+	leaq	.L.str.53.149(%rip), %rdx       # first piece of the message: .L.str.53.149
+	movq	%r14, %rdi
+	movq	%rbx, %rsi
+	jmp	.LBB180_11
+.LBB180_9:                              # %if.else.split
+	leaq	.L.str.53.149(%rip), %rdx
+	xorl	%ebx, %ebx                      # end pointer = 0
+	xorl	%edi, %edi                      # destination = 0
+	xorl	%esi, %esi
+.LBB180_11:                             # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit109
+	callq	halide_string_to_string@PLT
+	movslq	%r12d, %rdx                     # print the 3rd argument
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	leaq	.L.str.51(%rip), %rdx           # next piece: .L.str.51
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	movq	%r13, %rdx                      # next piece: the 2nd-argument string
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.54(%rip), %rdx           # next piece: .L.str.54
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	callq	halide_string_to_string@PLT
+	movslq	%r15d, %rdx                     # print the 4th argument
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	leaq	.L.str.55(%rip), %rdx           # next piece: .L.str.55
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	callq	halide_string_to_string@PLT
+	movl	-56(%rbp), %ecx                 # 4-byte Reload
+	decl	%ecx                            # (5th argument + 4th argument) minus one
+	movslq	%ecx, %rdx
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	leaq	.L.str.56(%rip), %rdx           # next piece: .L.str.56
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.59.151(%rip), %rdx       # next piece: .L.str.59.151
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.60.152(%rip), %rdx       # next piece: .L.str.60.152
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	callq	halide_string_to_string@PLT
+	movl	16(%rbp), %ecx                  # 7th argument
+	movslq	%ecx, %rdx
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	leaq	.L.str.30.142(%rip), %rdx       # closing piece for this variant, printed at .LBB180_6
+.LBB180_6:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	callq	halide_string_to_string@PLT     # shared tail: print the closing piece chosen by either variant
+	testq	%r14, %r14                      # was a buffer allocated?
+	je	.LBB180_7
+# %bb.12:
+	movq	-48(%rbp), %r15                 # 8-byte Reload
+	subq	%r14, %rax                      # rax = final dst pointer minus buffer start
+	incq	%rax                            # plus one (presumably covers the terminating zero byte)
+	movq	%r15, %rdi
+	movq	%r14, %rsi
+	movq	%rax, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT # called with (1st argument, buffer, length)
+	movq	%r14, %rbx                      # rbx = buffer, the pointer handed to free below
+	jmp	.LBB180_13
+.LBB180_7:
+	xorl	%ebx, %ebx                      # nothing to free
+	leaq	.L.str.29.165(%rip), %r14       # no buffer: message = fixed string .L.str.29.165
+	movq	-48(%rbp), %r15                 # 8-byte Reload
+.LBB180_13:                             # %if.end
+	movq	%r15, %rdi
+	movq	%r14, %rsi
+	callq	halide_error@PLT                # halide_error(1st argument, message)
+	movq	%rbx, %rdi
+	callq	free@PLT                        # free the buffer (rbx is 0 when malloc failed)
+	movl	$-35, %eax                      # return value is always -35
+	addq	$24, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end180:
+	.size	halide_error_bad_extern_fold, .Lfunc_end180-halide_error_bad_extern_fold
+                                        # -- End function
+	.section	.text.halide_error_fold_factor_too_small,"ax",@progbits
+	.weak	halide_error_fold_factor_too_small # -- Begin function halide_error_fold_factor_too_small
+	.p2align	4, 0x90
+	.type	halide_error_fold_factor_too_small,@function
+halide_error_fold_factor_too_small:     # @halide_error_fold_factor_too_small; uses rdi, strings in rsi, rdx, r8 and ints in ecx, r9d; builds a message, calls halide_error, returns -26
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	subq	$24, %rsp                       # room for the spill slots at -44, -56 and -64(%rbp)
+	movl	%r9d, -44(%rbp)                 # 4-byte Spill
+	movq	%r8, -64(%rbp)                  # 8-byte Spill
+	movl	%ecx, %r13d                     # r13d = 4th argument (int)
+	movq	%rdx, %rbx                      # rbx = 3rd argument (string)
+	movq	%rsi, %r15                      # r15 = 2nd argument (string)
+	movq	%rdi, -56(%rbp)                 # 8-byte Spill
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT                      # request a 1024-byte message buffer
+	movq	%rax, %r14                      # r14 = buffer start, 0 if malloc failed
+	testq	%rax, %rax
+	je	.LBB181_1                       # no buffer: use the all-zero pointer setup
+# %bb.2:                                # %if.then6.i
+	leaq	1023(%r14), %r12                # r12 = address of the last byte of the buffer
+	movb	$0, 1023(%r14)                  # store a zero byte in the last position
+	leaq	.L.str.61.153(%rip), %rdx       # first piece of the message: .L.str.61.153
+	movq	%r14, %rdi
+	movq	%r12, %rsi
+	jmp	.LBB181_3
+.LBB181_1:                              # %entry.split
+	leaq	.L.str.61.153(%rip), %rdx
+	xorl	%r12d, %r12d                    # end pointer = 0
+	xorl	%edi, %edi                      # destination = 0
+	xorl	%esi, %esi
+.LBB181_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit
+	callq	halide_string_to_string@PLT     # args rdi, rsi, rdx look like (dst, end, src); returned rax is the next dst
+	movslq	%r13d, %rdx                     # print the 4th argument as a signed 64-bit value
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movl	$1, %ecx                        # 4th argument of halide_int64_to_string is the constant 1
+	callq	halide_int64_to_string@PLT
+	leaq	.L.str.62(%rip), %rdx           # next piece: .L.str.62
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movq	%rbx, %rdx                      # next piece: the 3rd-argument string
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.51(%rip), %rdx           # next piece: .L.str.51
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movq	%r15, %rdx                      # next piece: the 2nd-argument string
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.63(%rip), %rdx           # next piece: .L.str.63
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movq	-64(%rbp), %rdx                 # 8-byte Reload
+	callq	halide_string_to_string@PLT     # prints the 5th-argument string reloaded above
+	leaq	.L.str.32.144(%rip), %rdx       # next piece: .L.str.32.144
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	callq	halide_string_to_string@PLT
+	movslq	-44(%rbp), %rdx                 # 4-byte Folded Reload
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT      # prints the 6th argument reloaded above
+	leaq	.L.str.64.154(%rip), %rdx       # last piece: .L.str.64.154
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	callq	halide_string_to_string@PLT
+	testq	%r14, %r14                      # was a buffer allocated?
+	je	.LBB181_4
+# %bb.5:                                # %if.else.i
+	subq	%r14, %rax                      # rax = final dst pointer minus buffer start
+	incq	%rax                            # plus one (presumably covers the terminating zero byte)
+	movq	-56(%rbp), %rbx                 # 8-byte Reload
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	movq	%rax, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT # called with (1st argument, buffer, length)
+	movq	%r14, %rsi                      # message = the formatted buffer
+	jmp	.LBB181_6
+.LBB181_4:
+	leaq	.L.str.29.165(%rip), %rsi       # no buffer: message = fixed string .L.str.29.165
+	movq	-56(%rbp), %rbx                 # 8-byte Reload
+.LBB181_6:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT                # halide_error(1st argument, message)
+	movq	%r14, %rdi
+	callq	free@PLT                        # free the buffer (r14 is 0 when malloc failed)
+	movl	$-26, %eax                      # return value is always -26
+	addq	$24, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end181:
+	.size	halide_error_fold_factor_too_small, .Lfunc_end181-halide_error_fold_factor_too_small
+                                        # -- End function
+	.section	.text.halide_error_requirement_failed,"ax",@progbits
+	.weak	halide_error_requirement_failed # -- Begin function halide_error_requirement_failed
+	.p2align	4, 0x90
+	.type	halide_error_requirement_failed,@function
+halide_error_requirement_failed:        # @halide_error_requirement_failed; uses rdi and two strings in rsi, rdx; builds a message, calls halide_error, returns -27
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	pushq	%rax                            # extra 8-byte slot; dropped by addq $8, %rsp in the epilogue
+	movq	%rdx, %r15                      # r15 = 3rd argument (string)
+	movq	%rsi, %r12                      # r12 = 2nd argument (string)
+	movq	%rdi, %rbx                      # rbx = 1st argument (forwarded to halide_error)
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT                      # request a 1024-byte message buffer
+	movq	%rax, %r14                      # r14 = buffer start, 0 if malloc failed
+	testq	%rax, %rax
+	je	.LBB182_1                       # no buffer: use the all-zero pointer setup
+# %bb.2:                                # %if.then6.i
+	leaq	1023(%r14), %r13                # r13 = address of the last byte of the buffer
+	movb	$0, 1023(%r14)                  # store a zero byte in the last position
+	leaq	.L.str.65(%rip), %rdx           # first piece of the message: .L.str.65
+	movq	%r14, %rdi
+	movq	%r13, %rsi
+	jmp	.LBB182_3
+.LBB182_1:                              # %entry.split
+	leaq	.L.str.65(%rip), %rdx
+	xorl	%r13d, %r13d                    # end pointer = 0
+	xorl	%edi, %edi                      # destination = 0
+	xorl	%esi, %esi
+.LBB182_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit
+	callq	halide_string_to_string@PLT     # args rdi, rsi, rdx look like (dst, end, src); returned rax is the next dst
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movq	%r12, %rdx                      # next piece: the 2nd-argument string
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.66(%rip), %rdx           # next piece: .L.str.66
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r13, %rsi
+	movq	%r15, %rdx                      # last piece: the 3rd-argument string
+	callq	halide_string_to_string@PLT
+	testq	%r14, %r14                      # was a buffer allocated?
+	je	.LBB182_4
+# %bb.5:                                # %if.else.i
+	subq	%r14, %rax                      # rax = final dst pointer minus buffer start
+	incq	%rax                            # plus one (presumably covers the terminating zero byte)
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	movq	%rax, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT # called with (1st argument, buffer, length)
+	movq	%r14, %rsi                      # message = the formatted buffer
+	jmp	.LBB182_6
+.LBB182_4:
+	leaq	.L.str.29.165(%rip), %rsi       # no buffer: message = fixed string .L.str.29.165
+.LBB182_6:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT                # halide_error(1st argument, message)
+	movq	%r14, %rdi
+	callq	free@PLT                        # free the buffer (r14 is 0 when malloc failed)
+	movl	$-27, %eax                      # return value is always -27
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end182:
+	.size	halide_error_requirement_failed, .Lfunc_end182-halide_error_requirement_failed
+                                        # -- End function
+	.section	.text.halide_error_specialize_fail,"ax",@progbits
+	.weak	halide_error_specialize_fail    # -- Begin function halide_error_specialize_fail
+	.p2align	4, 0x90
+	.type	halide_error_specialize_fail,@function
+halide_error_specialize_fail:           # @halide_error_specialize_fail; uses rdi, rsi (string); builds a message, calls halide_error, returns -31
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r12
+	pushq	%rbx
+	movq	%rsi, %r15                      # r15 = 2nd argument (string printed later)
+	movq	%rdi, %rbx                      # rbx = 1st argument (forwarded to halide_error)
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT                      # request a 1024-byte message buffer
+	movq	%rax, %r14                      # r14 = buffer start, 0 if malloc failed
+	testq	%rax, %rax
+	je	.LBB183_1                       # no buffer: separate path that passes zero pointers
+# %bb.2:                                # %if.else.i
+	leaq	1023(%r14), %r12                # r12 = address of the last byte of the buffer
+	movb	$0, 1023(%r14)                  # store a zero byte in the last position
+	leaq	.L.str.67(%rip), %rdx           # first piece of the message: .L.str.67
+	movq	%r14, %rdi
+	movq	%r12, %rsi
+	callq	halide_string_to_string@PLT     # args rdi, rsi, rdx look like (dst, end, src); returned rax is the next dst
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movq	%r15, %rdx                      # last piece: the 2nd-argument string
+	callq	halide_string_to_string@PLT
+	subq	%r14, %rax                      # rax = final dst pointer minus buffer start
+	leaq	1(%rax), %rdx                   # length = that difference plus one
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	callq	halide_msan_annotate_memory_is_initialized@PLT # called with (1st argument, buffer, length)
+	movq	%r14, %rsi                      # message = the formatted buffer
+	jmp	.LBB183_3
+.LBB183_1:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit.thread
+	leaq	.L.str.67(%rip), %rdx
+	xorl	%edi, %edi                      # destination = 0
+	xorl	%esi, %esi                      # end pointer = 0
+	callq	halide_string_to_string@PLT     # same two calls as above, made with zero dst and end
+	movq	%rax, %rdi
+	xorl	%esi, %esi
+	movq	%r15, %rdx
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.29.165(%rip), %rsi       # message = fixed string .L.str.29.165
+.LBB183_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT                # halide_error(1st argument, message)
+	movq	%r14, %rdi
+	callq	free@PLT                        # free the buffer (r14 is 0 when malloc failed)
+	movl	$-31, %eax                      # return value is always -31
+	popq	%rbx
+	popq	%r12
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end183:
+	.size	halide_error_specialize_fail, .Lfunc_end183-halide_error_specialize_fail
+                                        # -- End function
+	.section	.text.halide_error_no_device_interface,"ax",@progbits
+	.weak	halide_error_no_device_interface # -- Begin function halide_error_no_device_interface
+	.p2align	4, 0x90
+	.type	halide_error_no_device_interface,@function
+halide_error_no_device_interface:       # @halide_error_no_device_interface; uses rdi only; reports .L.str.68 through halide_error, returns -19
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r14
+	pushq	%rbx
+	movq	%rdi, %rbx                      # rbx = 1st argument (forwarded to halide_error)
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT                      # request a 1024-byte message buffer
+	movq	%rax, %r14                      # r14 = buffer start, 0 if malloc failed
+	testq	%rax, %rax
+	je	.LBB184_1                       # no buffer: separate path that passes zero pointers
+# %bb.2:                                # %if.else.i
+	leaq	1023(%r14), %rsi                # rsi = address of the last byte of the buffer
+	movb	$0, 1023(%r14)                  # store a zero byte in the last position
+	leaq	.L.str.68(%rip), %rdx           # the whole message is the constant .L.str.68
+	movq	%r14, %rdi
+	callq	halide_string_to_string@PLT     # args rdi, rsi, rdx look like (dst, end, src)
+	subq	%r14, %rax                      # rax = returned pointer minus buffer start
+	leaq	1(%rax), %rdx                   # length = that difference plus one
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	callq	halide_msan_annotate_memory_is_initialized@PLT # called with (1st argument, buffer, length)
+	movq	%r14, %rsi                      # message = the formatted buffer
+	jmp	.LBB184_3
+.LBB184_1:                              # %if.then.i
+	leaq	.L.str.68(%rip), %rdx
+	xorl	%edi, %edi                      # destination = 0
+	xorl	%esi, %esi                      # end pointer = 0
+	callq	halide_string_to_string@PLT     # same call as above, made with zero dst and end
+	leaq	.L.str.29.165(%rip), %rsi       # message = fixed string .L.str.29.165
+.LBB184_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT                # halide_error(1st argument, message)
+	movq	%r14, %rdi
+	callq	free@PLT                        # free the buffer (r14 is 0 when malloc failed)
+	movl	$-19, %eax                      # return value is always -19
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	retq
+.Lfunc_end184:
+	.size	halide_error_no_device_interface, .Lfunc_end184-halide_error_no_device_interface
+                                        # -- End function
+	.section	.text.halide_error_device_interface_no_device,"ax",@progbits
+	.weak	halide_error_device_interface_no_device # -- Begin function halide_error_device_interface_no_device
+	.p2align	4, 0x90
+	.type	halide_error_device_interface_no_device,@function
+halide_error_device_interface_no_device: # @halide_error_device_interface_no_device; uses rdi only; reports .L.str.69.159 through halide_error, returns -36
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r14
+	pushq	%rbx
+	movq	%rdi, %rbx                      # rbx = 1st argument (forwarded to halide_error)
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT                      # request a 1024-byte message buffer
+	movq	%rax, %r14                      # r14 = buffer start, 0 if malloc failed
+	testq	%rax, %rax
+	je	.LBB185_1                       # no buffer: separate path that passes zero pointers
+# %bb.2:                                # %if.else.i
+	leaq	1023(%r14), %rsi                # rsi = address of the last byte of the buffer
+	movb	$0, 1023(%r14)                  # store a zero byte in the last position
+	leaq	.L.str.69.159(%rip), %rdx       # the whole message is the constant .L.str.69.159
+	movq	%r14, %rdi
+	callq	halide_string_to_string@PLT     # args rdi, rsi, rdx look like (dst, end, src)
+	subq	%r14, %rax                      # rax = returned pointer minus buffer start
+	leaq	1(%rax), %rdx                   # length = that difference plus one
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	callq	halide_msan_annotate_memory_is_initialized@PLT # called with (1st argument, buffer, length)
+	movq	%r14, %rsi                      # message = the formatted buffer
+	jmp	.LBB185_3
+.LBB185_1:                              # %if.then.i
+	leaq	.L.str.69.159(%rip), %rdx
+	xorl	%edi, %edi                      # destination = 0
+	xorl	%esi, %esi                      # end pointer = 0
+	callq	halide_string_to_string@PLT     # same call as above, made with zero dst and end
+	leaq	.L.str.29.165(%rip), %rsi       # message = fixed string .L.str.29.165
+.LBB185_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT                # halide_error(1st argument, message)
+	movq	%r14, %rdi
+	callq	free@PLT                        # free the buffer (r14 is 0 when malloc failed)
+	movl	$-36, %eax                      # return value is always -36
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	retq
+.Lfunc_end185:
+	.size	halide_error_device_interface_no_device, .Lfunc_end185-halide_error_device_interface_no_device
+                                        # -- End function
+	.section	.text.halide_error_host_and_device_dirty,"ax",@progbits
+	.weak	halide_error_host_and_device_dirty # -- Begin function halide_error_host_and_device_dirty
+	.p2align	4, 0x90
+	.type	halide_error_host_and_device_dirty,@function
+halide_error_host_and_device_dirty:     # @halide_error_host_and_device_dirty; uses rdi only; reports .L.str.70 through halide_error, returns -37
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r14
+	pushq	%rbx
+	movq	%rdi, %rbx                      # rbx = 1st argument (forwarded to halide_error)
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT                      # request a 1024-byte message buffer
+	movq	%rax, %r14                      # r14 = buffer start, 0 if malloc failed
+	testq	%rax, %rax
+	je	.LBB186_1                       # no buffer: separate path that passes zero pointers
+# %bb.2:                                # %if.else.i
+	leaq	1023(%r14), %rsi                # rsi = address of the last byte of the buffer
+	movb	$0, 1023(%r14)                  # store a zero byte in the last position
+	leaq	.L.str.70(%rip), %rdx           # the whole message is the constant .L.str.70
+	movq	%r14, %rdi
+	callq	halide_string_to_string@PLT     # args rdi, rsi, rdx look like (dst, end, src)
+	subq	%r14, %rax                      # rax = returned pointer minus buffer start
+	leaq	1(%rax), %rdx                   # length = that difference plus one
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	callq	halide_msan_annotate_memory_is_initialized@PLT # called with (1st argument, buffer, length)
+	movq	%r14, %rsi                      # message = the formatted buffer
+	jmp	.LBB186_3
+.LBB186_1:                              # %if.then.i
+	leaq	.L.str.70(%rip), %rdx
+	xorl	%edi, %edi                      # destination = 0
+	xorl	%esi, %esi                      # end pointer = 0
+	callq	halide_string_to_string@PLT     # same call as above, made with zero dst and end
+	leaq	.L.str.29.165(%rip), %rsi       # message = fixed string .L.str.29.165
+.LBB186_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT                # halide_error(1st argument, message)
+	movq	%r14, %rdi
+	callq	free@PLT                        # free the buffer (r14 is 0 when malloc failed)
+	movl	$-37, %eax                      # return value is always -37
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	retq
+.Lfunc_end186:
+	.size	halide_error_host_and_device_dirty, .Lfunc_end186-halide_error_host_and_device_dirty
+                                        # -- End function
+	.section	.text.halide_error_buffer_is_null,"ax",@progbits
+	.weak	halide_error_buffer_is_null     # -- Begin function halide_error_buffer_is_null
+	.p2align	4, 0x90
+	.type	halide_error_buffer_is_null,@function
+halide_error_buffer_is_null:            # @halide_error_buffer_is_null; uses rdi, rsi (string); builds a message, calls halide_error, returns -38
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r12
+	pushq	%rbx
+	movq	%rsi, %r15                      # r15 = 2nd argument (string printed later)
+	movq	%rdi, %rbx                      # rbx = 1st argument (forwarded to halide_error)
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT                      # request a 1024-byte message buffer
+	movq	%rax, %r14                      # r14 = buffer start, 0 if malloc failed
+	testq	%rax, %rax
+	je	.LBB187_1                       # no buffer: use the all-zero pointer setup
+# %bb.2:                                # %if.then6.i
+	leaq	1023(%r14), %r12                # r12 = address of the last byte of the buffer
+	movb	$0, 1023(%r14)                  # store a zero byte in the last position
+	leaq	.L.str.71(%rip), %rdx           # first piece of the message: .L.str.71
+	movq	%r14, %rdi
+	movq	%r12, %rsi
+	jmp	.LBB187_3
+.LBB187_1:                              # %entry.split
+	leaq	.L.str.71(%rip), %rdx
+	xorl	%r12d, %r12d                    # end pointer = 0
+	xorl	%edi, %edi                      # destination = 0
+	xorl	%esi, %esi
+.LBB187_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit
+	callq	halide_string_to_string@PLT     # args rdi, rsi, rdx look like (dst, end, src); returned rax is the next dst
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movq	%r15, %rdx                      # next piece: the 2nd-argument string
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.72(%rip), %rdx           # last piece: .L.str.72
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	callq	halide_string_to_string@PLT
+	testq	%r14, %r14                      # was a buffer allocated?
+	je	.LBB187_4
+# %bb.5:                                # %if.else.i
+	subq	%r14, %rax                      # rax = final dst pointer minus buffer start
+	incq	%rax                            # plus one (presumably covers the terminating zero byte)
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	movq	%rax, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT # called with (1st argument, buffer, length)
+	movq	%r14, %rsi                      # message = the formatted buffer
+	jmp	.LBB187_6
+.LBB187_4:
+	leaq	.L.str.29.165(%rip), %rsi       # no buffer: message = fixed string .L.str.29.165
+.LBB187_6:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT                # halide_error(1st argument, message)
+	movq	%r14, %rdi
+	callq	free@PLT                        # free the buffer (r14 is 0 when malloc failed)
+	movl	$-38, %eax                      # return value is always -38
+	popq	%rbx
+	popq	%r12
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end187:
+	.size	halide_error_buffer_is_null, .Lfunc_end187-halide_error_buffer_is_null
+                                        # -- End function
+	.section	.text.halide_error_storage_bound_too_small,"ax",@progbits
+	.weak	halide_error_storage_bound_too_small # -- Begin function halide_error_storage_bound_too_small
+	.p2align	4, 0x90
+	.type	halide_error_storage_bound_too_small,@function
+halide_error_storage_bound_too_small:   # @halide_error_storage_bound_too_small; uses rdi, strings in rsi, rdx and ints in ecx, r8d; builds a message, calls halide_error, returns -45
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	subq	$24, %rsp                       # room for the spill slots at -44 and -56(%rbp)
+	movl	%r8d, -44(%rbp)                 # 4-byte Spill
+	movl	%ecx, %ebx                      # ebx = 4th argument (int)
+	movq	%rdx, %r15                      # r15 = 3rd argument (string)
+	movq	%rsi, %r13                      # r13 = 2nd argument (string)
+	movq	%rdi, -56(%rbp)                 # 8-byte Spill
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT                      # request a 1024-byte message buffer
+	movq	%rax, %r14                      # r14 = buffer start, 0 if malloc failed
+	testq	%rax, %rax
+	je	.LBB188_1                       # no buffer: use the all-zero pointer setup
+# %bb.2:                                # %if.then6.i
+	leaq	1023(%r14), %r12                # r12 = address of the last byte of the buffer
+	movb	$0, 1023(%r14)                  # store a zero byte in the last position
+	leaq	.L.str.73(%rip), %rdx           # first piece of the message: .L.str.73
+	movq	%r14, %rdi
+	movq	%r12, %rsi
+	jmp	.LBB188_3
+.LBB188_1:                              # %entry.split
+	leaq	.L.str.73(%rip), %rdx
+	xorl	%r12d, %r12d                    # end pointer = 0
+	xorl	%edi, %edi                      # destination = 0
+	xorl	%esi, %esi
+.LBB188_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EEC2EPvPc.exit
+	callq	halide_string_to_string@PLT     # args rdi, rsi, rdx look like (dst, end, src); returned rax is the next dst
+	movslq	%ebx, %rdx                      # print the 4th argument as a signed 64-bit value
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movl	$1, %ecx                        # 4th argument of halide_int64_to_string is the constant 1
+	callq	halide_int64_to_string@PLT
+	leaq	.L.str.62(%rip), %rdx           # next piece: .L.str.62
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movq	%r15, %rdx                      # next piece: the 3rd-argument string
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.51(%rip), %rdx           # next piece: .L.str.51
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movq	%r13, %rdx                      # next piece: the 2nd-argument string
+	callq	halide_string_to_string@PLT
+	leaq	.L.str.74(%rip), %rdx           # next piece: .L.str.74
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	callq	halide_string_to_string@PLT
+	movslq	-44(%rbp), %rdx                 # 4-byte Folded Reload
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT      # prints the 5th argument reloaded above
+	leaq	.L.str.64.154(%rip), %rdx       # last piece: .L.str.64.154
+	movq	%rax, %rdi
+	movq	%r12, %rsi
+	callq	halide_string_to_string@PLT
+	testq	%r14, %r14                      # was a buffer allocated?
+	je	.LBB188_4
+# %bb.5:                                # %if.else.i
+	subq	%r14, %rax                      # rax = final dst pointer minus buffer start
+	incq	%rax                            # plus one (presumably covers the terminating zero byte)
+	movq	-56(%rbp), %rbx                 # 8-byte Reload
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	movq	%rax, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT # called with (1st argument, buffer, length)
+	movq	%r14, %rsi                      # message = the formatted buffer
+	jmp	.LBB188_6
+.LBB188_4:
+	leaq	.L.str.29.165(%rip), %rsi       # no buffer: message = fixed string .L.str.29.165
+	movq	-56(%rbp), %rbx                 # 8-byte Reload
+.LBB188_6:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT                # halide_error(1st argument, message)
+	movq	%r14, %rdi
+	callq	free@PLT                        # free the buffer (r14 is 0 when malloc failed)
+	movl	$-45, %eax                      # return value is always -45
+	addq	$24, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end188:
+	.size	halide_error_storage_bound_too_small, .Lfunc_end188-halide_error_storage_bound_too_small
+                                        # -- End function
+	.section	.text.halide_error_device_crop_failed,"ax",@progbits
+	.weak	halide_error_device_crop_failed # -- Begin function halide_error_device_crop_failed
+	.p2align	4, 0x90
+	.type	halide_error_device_crop_failed,@function
+halide_error_device_crop_failed:        # @halide_error_device_crop_failed; uses rdi only; reports .L.str.75 through halide_error, returns -41
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r14
+	pushq	%rbx
+	movq	%rdi, %rbx                      # rbx = 1st argument (forwarded to halide_error)
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT                      # request a 1024-byte message buffer
+	movq	%rax, %r14                      # r14 = buffer start, 0 if malloc failed
+	testq	%rax, %rax
+	je	.LBB189_1                       # no buffer: separate path that passes zero pointers
+# %bb.2:                                # %if.else.i
+	leaq	1023(%r14), %rsi                # rsi = address of the last byte of the buffer
+	movb	$0, 1023(%r14)                  # store a zero byte in the last position
+	leaq	.L.str.75(%rip), %rdx           # the whole message is the constant .L.str.75
+	movq	%r14, %rdi
+	callq	halide_string_to_string@PLT     # args rdi, rsi, rdx look like (dst, end, src)
+	subq	%r14, %rax                      # rax = returned pointer minus buffer start
+	leaq	1(%rax), %rdx                   # length = that difference plus one
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	callq	halide_msan_annotate_memory_is_initialized@PLT # called with (1st argument, buffer, length)
+	movq	%r14, %rsi                      # message = the formatted buffer
+	jmp	.LBB189_3
+.LBB189_1:                              # %if.then.i
+	leaq	.L.str.75(%rip), %rdx
+	xorl	%edi, %edi                      # destination = 0
+	xorl	%esi, %esi                      # end pointer = 0
+	callq	halide_string_to_string@PLT     # same call as above, made with zero dst and end
+	leaq	.L.str.29.165(%rip), %rsi       # message = fixed string .L.str.29.165
+.LBB189_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE1ELy1024EED2Ev.exit
+	movq	%rbx, %rdi
+	callq	halide_error@PLT                # halide_error(1st argument, message)
+	movq	%r14, %rdi
+	callq	free@PLT                        # free the buffer (r14 is 0 when malloc failed)
+	movl	$-41, %eax                      # return value is always -41
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	retq
+.Lfunc_end189:
+	.size	halide_error_device_crop_failed, .Lfunc_end189-halide_error_device_crop_failed
+                                        # -- End function
+	.section	.text.halide_profiler_shutdown,"ax",@progbits    # If the state holds a non-NULL handle at offset 40: joins it, prints the report, then tail-calls the reset.
+	.weak	halide_profiler_shutdown        # -- Begin function halide_profiler_shutdown
+	.p2align	4, 0x90
+	.type	halide_profiler_shutdown,@function
+halide_profiler_shutdown:               # @halide_profiler_shutdown
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%rbx
+	pushq	%rax                            # reserves 8 bytes of stack, released by the addq $8 on both exits
+	callq	halide_profiler_get_state@PLT   # rax = pointer to the profiler state
+	movq	40(%rax), %rdi                  # pointer-sized field at state offset 40, handed to halide_join_thread below
+	testq	%rdi, %rdi
+	je	.LBB190_1                       # field is NULL: nothing to shut down
+# %bb.2:                                # %if.end
+	movq	%rax, %rbx                      # keep the state pointer across the calls
+	movl	$-2, 16(%rax)                   # store -2 in the 32-bit field at offset 16 before joining (presumably a stop request; TODO confirm)
+	callq	halide_join_thread@PLT
+	movq	$0, 40(%rbx)                    # clear the handle that was just joined
+	movl	$-1, 16(%rbx)                   # field at offset 16 becomes -1 afterwards
+	xorl	%edi, %edi                      # first argument NULL
+	movq	%rbx, %rsi                      # second argument = state
+	callq	halide_profiler_report_unlocked@PLT
+	movq	%rbx, %rdi                      # argument for the tail call: state
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%rbp
+	jmp	halide_profiler_reset_unlocked@PLT # TAILCALL
+.LBB190_1:                              # %cleanup
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%rbp
+	retq
+.Lfunc_end190:
+	.size	halide_profiler_shutdown, .Lfunc_end190-halide_profiler_shutdown
+                                        # -- End function
+	.section	.text.halide_profiler_get_state,"ax",@progbits    # Returns the address of the profiler state object; takes no arguments.
+	.weak	halide_profiler_get_state       # -- Begin function halide_profiler_get_state
+	.p2align	4, 0x90
+	.type	halide_profiler_get_state,@function
+halide_profiler_get_state:              # @halide_profiler_get_state
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	leaq	_ZZ25halide_profiler_get_stateE1s(%rip), %rax    # rax = address of the function-local static named s (per the mangled symbol)
+	popq	%rbp
+	retq
+.Lfunc_end191:
+	.size	halide_profiler_get_state, .Lfunc_end191-halide_profiler_get_state
+                                        # -- End function
+	.section	.rodata.cst4,"aM",@progbits,4    # constant pool referenced by halide_profiler_report_unlocked below
+	.p2align	2, 0x0                          # -- Begin function halide_profiler_report_unlocked
+.LCPI192_0:                             # scale factor applied to the 64-bit counters at offset 0 before they are printed
+	.long	0x49742400                      # float 1.0E+6
+	.section	.rodata.cst16,"aM",@progbits,16
+	.p2align	4, 0x0
+.LCPI192_1:                             # exponent words interleaved with the two 32-bit halves of a u64 for the u64-to-double conversion
+	.long	1127219200                      # 0x43300000
+	.long	1160773632                      # 0x45300000
+	.long	0                               # 0x0
+	.long	0                               # 0x0
+.LCPI192_2:                             # biases subtracted after the interleave; the two lanes are then summed
+	.quad	0x4330000000000000              # double 4503599627370496
+	.quad	0x4530000000000000              # double 1.9342813113834067E+25
+	.section	.rodata.cst8,"aM",@progbits,8
+	.p2align	3, 0x0
+.LCPI192_3:                             # small constant added to the divisor in the two ratio computations
+	.quad	0x3ddb7cdfd9d7bdbb              # double 1.0E-10
+	.section	.text.halide_profiler_report_unlocked,"ax",@progbits    # Walks the node list at 24(second argument) and prints a text report per node through halide_print(first argument, text).
+	.weak	halide_profiler_report_unlocked
+	.p2align	4, 0x90
+	.type	halide_profiler_report_unlocked,@function
+halide_profiler_report_unlocked:        # @halide_profiler_report_unlocked
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	subq	$72, %rsp
+	movq	%rsi, %r12                      # r12 = second argument; its list head is read at 24(%r12) below
+	movq	%rdi, -80(%rbp)                 # 8-byte Spill
+	movl	$1024, %edi                     # imm = 0x400
+	callq	malloc@PLT                      # 1024-byte text buffer; a NULL result is tolerated throughout
+	movq	%rax, %rbx                      # rbx = buffer start (NULL if malloc failed)
+	testq	%rax, %rax
+	je	.LBB192_1
+# %bb.2:                                # %if.then6.i
+	leaq	1023(%rbx), %r15                # r15 = address of the last buffer byte, the end limit for every *_to_string call
+	movb	$0, 1023(%rbx)                  # keep the final byte NUL
+	jmp	.LBB192_3
+.LBB192_1:
+	xorl	%r15d, %r15d                    # no buffer: the end limit is NULL too
+.LBB192_3:                              # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE2ELy1024EEC2EPvPc.exit
+	movq	24(%r12), %rax                  # head of the node list (the next link is at node offset 64, see .LBB192_77)
+	movq	%rbx, %r12                      # r12 now tracks the end of the text written so far
+	movq	%rax, -48(%rbp)                 # 8-byte Spill
+	testq	%rax, %rax
+	je	.LBB192_8                       # empty list: go straight to cleanup
+# %bb.4:                                # %for.body.lr.ph
+	movl	$1, %eax
+	subq	%rbx, %rax                      # rax = 1 - buffer start; adding an end position later gives length plus terminator
+	movq	%rax, -104(%rbp)                # 8-byte Spill
+	leaq	.L.str.20.179(%rip), %r13       # r13 = string appended repeatedly by the padding loops (contents not in this chunk)
+	movq	%rbx, %r12
+	jmp	.LBB192_5
+	.p2align	4, 0x90
+.LBB192_77:                             # %cleanup181
+                                        #   in Loop: Header=BB192_5 Depth=1
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	64(%rax), %rax                  # advance to the next node
+	movq	%rax, -48(%rbp)                 # 8-byte Spill
+	testq	%rax, %rax
+	je	.LBB192_8
+.LBB192_5:                              # %for.body
+                                        # =>This Loop Header: Depth=1
+                                        #     Child Loop BB192_25 Depth 2
+                                        #     Child Loop BB192_28 Depth 2
+                                        #       Child Loop BB192_35 Depth 3
+                                        #       Child Loop BB192_43 Depth 3
+                                        #       Child Loop BB192_50 Depth 3
+                                        #       Child Loop BB192_56 Depth 3
+                                        #       Child Loop BB192_61 Depth 3
+                                        #       Child Loop BB192_63 Depth 3
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	cmpl	$0, 80(%rax)                    # nodes whose 32-bit field at offset 80 is zero are skipped entirely
+	je	.LBB192_77
+# %bb.6:                                # %if.end
+                                        #   in Loop: Header=BB192_5 Depth=1
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	(%rax), %rax                    # 64-bit counter at node offset 0
+	testq	%rax, %rax
+	js	.LBB192_7                       # top bit set: take the unsigned conversion path
+# %bb.12:                               # %if.end
+                                        #   in Loop: Header=BB192_5 Depth=1
+	vcvtsi2ss	%rax, %xmm4, %xmm0
+	vdivss	.LCPI192_0(%rip), %xmm0, %xmm0  # counter / 1.0E+6
+	testq	%rbx, %rbx
+	vmovss	%xmm0, -64(%rbp)                # 4-byte Spill
+	je	.LBB192_14
+.LBB192_15:                             # %if.then.i260
+                                        #   in Loop: Header=BB192_5 Depth=1
+	movb	$0, (%rbx)                      # reset the buffer to an empty string
+	movq	-48(%rbp), %r14                 # 8-byte Reload
+	movq	32(%r14), %rcx                  # 64-bit node field at offset 32
+	movq	40(%r14), %rax                  # 64-bit node field at offset 40
+	movq	%rax, -72(%rbp)                 # 8-byte Spill
+	movq	%rcx, -96(%rbp)                 # 8-byte Spill
+	cmpq	%rax, %rcx
+	sete	%r12b                           # r12b = 1 when the two fields are equal; this suppresses the ratio output
+	movq	48(%r14), %rdx                  # string pointer stored at node offset 48 is printed first
+	movq	%rbx, %rdi
+	jmp	.LBB192_16
+	.p2align	4, 0x90
+.LBB192_7:                              #   in Loop: Header=BB192_5 Depth=1
+	movq	%rax, %rcx                      # unsigned 64-bit to float: halve keeping the low bit sticky, convert, then double
+	shrq	%rcx
+	andl	$1, %eax
+	orq	%rcx, %rax
+	vcvtsi2ss	%rax, %xmm4, %xmm0
+	vaddss	%xmm0, %xmm0, %xmm0
+	vdivss	.LCPI192_0(%rip), %xmm0, %xmm0  # counter / 1.0E+6
+	testq	%rbx, %rbx
+	vmovss	%xmm0, -64(%rbp)                # 4-byte Spill
+	jne	.LBB192_15
+.LBB192_14:                             # %if.end.split
+                                        #   in Loop: Header=BB192_5 Depth=1
+	movq	-48(%rbp), %r14                 # 8-byte Reload
+	movq	32(%r14), %rcx                  # same field loads as above, for the no-buffer case
+	movq	40(%r14), %rax
+	movq	%rax, -72(%rbp)                 # 8-byte Spill
+	movq	%rcx, -96(%rbp)                 # 8-byte Spill
+	cmpq	%rax, %rcx
+	sete	%r12b
+	movq	48(%r14), %rdx
+	xorl	%edi, %edi                      # no buffer: the write position is NULL
+.LBB192_16:                             # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE2ELy1024EE5clearEv.exit
+                                        #   in Loop: Header=BB192_5 Depth=1
+	movq	%r15, %rsi
+	callq	halide_string_to_string@PLT     # each call returns the new end position, which feeds the next call
+	movq	%rax, %rdi
+	movb	%r12b, -49(%rbp)                # 1-byte Spill
+	movq	%r15, %rsi
+	leaq	.L.str.7.166(%rip), %rdx
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	leaq	.L.str.8.167(%rip), %rdx
+	callq	halide_string_to_string@PLT
+	vmovss	-64(%rbp), %xmm0                # 4-byte Reload
+                                        # xmm0 = mem[0],zero,zero,zero
+	vcvtss2sd	%xmm0, %xmm0, %xmm0
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	xorl	%edx, %edx
+	callq	halide_double_to_string@PLT     # prints the scaled counter
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	leaq	.L.str.9.168(%rip), %rdx
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	leaq	.L.str.10.169(%rip), %rdx
+	callq	halide_string_to_string@PLT
+	movslq	84(%r14), %rdx                  # 32-bit node field at offset 84
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	leaq	.L.str.11.170(%rip), %rdx
+	callq	halide_string_to_string@PLT
+	movslq	80(%r14), %rdx                  # 32-bit node field at offset 80
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	leaq	.L.str.12.171(%rip), %rdx
+	callq	halide_string_to_string@PLT
+	movq	%r14, -48(%rbp)                 # 8-byte Spill
+	vcvtsi2ssl	80(%r14), %xmm4, %xmm0
+	vmovss	-64(%rbp), %xmm1                # 4-byte Reload
+                                        # xmm1 = mem[0],zero,zero,zero
+	vdivss	%xmm0, %xmm1, %xmm0             # scaled counter divided by the field at offset 80
+	vcvtss2sd	%xmm0, %xmm0, %xmm0
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	xorl	%edx, %edx
+	callq	halide_double_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	leaq	.L.str.13.172(%rip), %rdx
+	callq	halide_string_to_string@PLT
+	testb	%r12b, %r12b
+	jne	.LBB192_18                      # fields at offsets 32 and 40 are equal: skip the ratio output
+# %bb.17:                               # %if.then24
+                                        #   in Loop: Header=BB192_5 Depth=1
+	vmovsd	-96(%rbp), %xmm0                # 8-byte Reload
+                                        # xmm0 = mem[0],zero
+	vmovapd	.LCPI192_1(%rip), %xmm2         # xmm2 = [1127219200,1160773632,0,0]
+	vunpcklps	%xmm2, %xmm0, %xmm0     # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+	vmovapd	.LCPI192_2(%rip), %xmm3         # xmm3 = [4.503599627370496E+15,1.9342813113834067E+25]
+	vsubpd	%xmm3, %xmm0, %xmm0
+	vpermilpd	$1, %xmm0, %xmm1        # xmm1 = xmm0[1,0]
+	vaddsd	%xmm0, %xmm1, %xmm0             # xmm0 = field at offset 32 as a double
+	vmovsd	-72(%rbp), %xmm1                # 8-byte Reload
+                                        # xmm1 = mem[0],zero
+	vunpcklps	%xmm2, %xmm1, %xmm1     # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+	vsubpd	%xmm3, %xmm1, %xmm1
+	vpermilpd	$1, %xmm1, %xmm2        # xmm2 = xmm1[1,0]
+	vaddsd	%xmm1, %xmm2, %xmm1             # xmm1 = field at offset 40 as a double
+	vaddsd	.LCPI192_3(%rip), %xmm1, %xmm1  # plus 1.0E-10
+	vdivsd	%xmm1, %xmm0, %xmm0             # ratio = field 32 / (field 40 + 1.0E-10)
+	vcvtsd2ss	%xmm0, %xmm0, %xmm0
+	vmovss	%xmm0, -64(%rbp)                # 4-byte Spill
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	leaq	.L.str.14.173(%rip), %rdx
+	callq	halide_string_to_string@PLT
+	vmovss	-64(%rbp), %xmm0                # 4-byte Reload
+                                        # xmm0 = mem[0],zero,zero,zero
+	vcvtss2sd	%xmm0, %xmm0, %xmm0
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	xorl	%edx, %edx
+	callq	halide_double_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	leaq	.L.str.7.166(%rip), %rdx
+	callq	halide_string_to_string@PLT
+.LBB192_18:                             # %if.end28
+                                        #   in Loop: Header=BB192_5 Depth=1
+	movq	-80(%rbp), %r14                 # 8-byte Reload
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	leaq	.L.str.15.174(%rip), %rdx
+	callq	halide_string_to_string@PLT
+	movq	-48(%rbp), %r12                 # 8-byte Reload
+	movslq	88(%r12), %rdx                  # 32-bit node field at offset 88
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	leaq	.L.str.16.175(%rip), %rdx
+	callq	halide_string_to_string@PLT
+	movq	16(%r12), %rdx                  # 64-bit node field at offset 16, printed unsigned
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movl	$1, %ecx
+	callq	halide_uint64_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	leaq	.L.str.17.176(%rip), %rdx
+	callq	halide_string_to_string@PLT
+	movq	%rax, %r12                      # r12 = end of the text just written
+	leaq	.L.str.29.165(%rip), %rsi       # fallback string printed when there is no buffer (contents not in this chunk)
+	testq	%rbx, %rbx
+	je	.LBB192_20
+# %bb.19:                               # %if.then.i334
+                                        #   in Loop: Header=BB192_5 Depth=1
+	movq	-104(%rbp), %rax                # 8-byte Reload
+	leaq	(%rax,%r12), %rdx               # length including the terminator: end - start + 1
+	movq	%r14, %rdi
+	movq	%rbx, %rsi
+	callq	halide_msan_annotate_memory_is_initialized@PLT
+	movq	%rbx, %rsi
+.LBB192_20:                             # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE2ELy1024EE3strEv.exit
+                                        #   in Loop: Header=BB192_5 Depth=1
+	movq	%r14, %rdi
+	callq	halide_print@PLT                # emit the per-node summary text
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	cmpq	$0, (%rax)
+	jne	.LBB192_26                      # nonzero counter at offset 0: print the per-entry lines
+# %bb.21:                               # %lor.end
+                                        #   in Loop: Header=BB192_5 Depth=1
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	cmpq	$0, 24(%rax)
+	je	.LBB192_22                      # fields at offsets 0 and 24 both zero: scan the entries first (.LBB192_25)
+.LBB192_26:                             # %for.cond53.critedge
+                                        #   in Loop: Header=BB192_5 Depth=1
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	cmpl	$0, 72(%rax)                    # 32-bit entry count at node offset 72
+	jle	.LBB192_77
+# %bb.27:                               # %for.body57.lr.ph
+                                        #   in Loop: Header=BB192_5 Depth=1
+	xorl	%ecx, %ecx                      # entry index = 0
+	jmp	.LBB192_28
+	.p2align	4, 0x90
+.LBB192_75:                             # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE2ELy1024EE3strEv.exit456
+                                        #   in Loop: Header=BB192_28 Depth=2
+	movq	%r14, %rdi
+	callq	halide_print@PLT                # emit this entry's line
+	movq	-96(%rbp), %rcx                 # 8-byte Reload
+.LBB192_76:                             # %cleanup172
+                                        #   in Loop: Header=BB192_28 Depth=2
+	incq	%rcx                            # next entry index
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movslq	72(%rax), %rax
+	cmpq	%rax, %rcx
+	jge	.LBB192_77
+.LBB192_28:                             # %for.body57
+                                        #   Parent Loop BB192_5 Depth=1
+                                        # =>  This Loop Header: Depth=2
+                                        #       Child Loop BB192_35 Depth 3
+                                        #       Child Loop BB192_43 Depth 3
+                                        #       Child Loop BB192_50 Depth 3
+                                        #       Child Loop BB192_56 Depth 3
+                                        #       Child Loop BB192_61 Depth 3
+                                        #       Child Loop BB192_63 Depth 3
+	testq	%rbx, %rbx
+	je	.LBB192_30
+# %bb.29:                               # %if.then.i337
+                                        #   in Loop: Header=BB192_28 Depth=2
+	movb	$0, (%rbx)                      # reset the buffer for this entry's line
+.LBB192_30:                             # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE2ELy1024EE5clearEv.exit339
+                                        #   in Loop: Header=BB192_28 Depth=2
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	56(%rax), %r14                  # base of the entry array (pointer at node offset 56)
+	leaq	(%rcx,%rcx,8), %rdx             # rdx = 9 * index
+	leaq	(%r14,%rdx,8), %rsi             # rsi = address of entry[index]; entries are 72 bytes apart
+	testq	%rcx, %rcx
+	jne	.LBB192_32
+# %bb.31:                               # %land.lhs.true
+                                        #   in Loop: Header=BB192_28 Depth=2
+	cmpq	$0, (%rsi)                      # entry 0 is skipped when its 64-bit field at offset 0 is zero
+	movq	%rbx, %r12
+	je	.LBB192_76
+.LBB192_32:                             # %if.end66
+                                        #   in Loop: Header=BB192_28 Depth=2
+	movq	%rsi, -88(%rbp)                 # 8-byte Spill
+	movq	%rcx, -96(%rbp)                 # 8-byte Spill
+	movq	%rbx, %rdi
+	movq	%rdx, %r12
+	movq	%r15, %rsi
+	leaq	.L.str.18.177(%rip), %rdx
+	callq	halide_string_to_string@PLT
+	movq	%r14, -64(%rbp)                 # 8-byte Spill
+	movq	%r12, -72(%rbp)                 # 8-byte Spill
+	movq	56(%r14,%r12,8), %rdx           # string pointer at entry offset 56
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	leaq	.L.str.19.178(%rip), %rdx
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rcx
+	subq	%rbx, %rcx                      # rcx = current line length
+	cmpq	$24, %rcx
+	ja	.LBB192_33                      # already 25 or more characters: no padding needed
+# %bb.34:                               # %while.body.preheader
+                                        #   in Loop: Header=BB192_28 Depth=2
+	movq	-88(%rbp), %r12                 # 8-byte Reload
+	.p2align	4, 0x90
+.LBB192_35:                             # %while.body
+                                        #   Parent Loop BB192_5 Depth=1
+                                        #     Parent Loop BB192_28 Depth=2
+                                        # =>    This Inner Loop Header: Depth=3
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movq	%r13, %rdx
+	callq	halide_string_to_string@PLT     # append the padding string
+	movq	%rax, %rcx
+	subq	%rbx, %rcx
+	cmpq	$25, %rcx
+	jb	.LBB192_35                      # repeat until the line is at least 25 characters long
+# %bb.36:                               # %while.end
+                                        #   in Loop: Header=BB192_28 Depth=2
+	movq	(%r12), %rcx                    # 64-bit counter at entry offset 0
+	testq	%rcx, %rcx
+	js	.LBB192_37
+.LBB192_38:                             # %while.end
+                                        #   in Loop: Header=BB192_28 Depth=2
+	vcvtsi2ss	%rcx, %xmm4, %xmm0
+	jmp	.LBB192_39
+	.p2align	4, 0x90
+.LBB192_33:                             #   in Loop: Header=BB192_28 Depth=2
+	movq	-88(%rbp), %r12                 # 8-byte Reload
+	movq	(%r12), %rcx                    # 64-bit counter at entry offset 0
+	testq	%rcx, %rcx
+	jns	.LBB192_38
+.LBB192_37:                             #   in Loop: Header=BB192_28 Depth=2
+	movq	%rcx, %rdx                      # unsigned 64-bit to float: halve keeping the low bit sticky, convert, then double
+	shrq	%rdx
+	andl	$1, %ecx
+	orq	%rdx, %rcx
+	vcvtsi2ss	%rcx, %xmm4, %xmm0
+	vaddss	%xmm0, %xmm0, %xmm0
+.LBB192_39:                             # %while.end
+                                        #   in Loop: Header=BB192_28 Depth=2
+	movq	-48(%rbp), %rcx                 # 8-byte Reload
+	vcvtsi2ssl	80(%rcx), %xmm4, %xmm1  # node field at offset 80 as a float
+	vmulss	.LCPI192_0(%rip), %xmm1, %xmm1  # times 1.0E+6
+	vdivss	%xmm1, %xmm0, %xmm0             # entry counter / (node field 80 * 1.0E+6)
+	vcvtss2sd	%xmm0, %xmm0, %xmm0
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	xorl	%edx, %edx
+	callq	halide_double_to_string@PLT
+	testq	%rax, %rax
+	je	.LBB192_40
+# %bb.41:                               # %if.then.i359
+                                        #   in Loop: Header=BB192_28 Depth=2
+	addq	$-3, %rax                       # drop the last 3 characters just written
+	cmpq	%rbx, %rax
+	cmovbq	%rbx, %rax                      # but never move before the buffer start
+	movb	$0, (%rax)
+	jmp	.LBB192_42
+	.p2align	4, 0x90
+.LBB192_40:                             #   in Loop: Header=BB192_28 Depth=2
+	xorl	%eax, %eax
+.LBB192_42:                             # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE2ELy1024EE5eraseEi.exit
+                                        #   in Loop: Header=BB192_28 Depth=2
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	leaq	.L.str.21.180(%rip), %rdx
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rdi
+	subq	%rbx, %rax
+	cmpq	$34, %rax
+	ja	.LBB192_44                      # already 35 or more characters: no padding needed
+	.p2align	4, 0x90
+.LBB192_43:                             # %while.body86
+                                        #   Parent Loop BB192_5 Depth=1
+                                        #     Parent Loop BB192_28 Depth=2
+                                        # =>    This Inner Loop Header: Depth=3
+	movq	%r15, %rsi
+	movq	%r13, %rdx
+	callq	halide_string_to_string@PLT     # append the padding string
+	movq	%rax, %rdi
+	subq	%rbx, %rax
+	cmpq	$35, %rax
+	jb	.LBB192_43                      # repeat until the line is at least 35 characters long
+.LBB192_44:                             # %while.end88
+                                        #   in Loop: Header=BB192_28 Depth=2
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	(%rax), %rcx                    # node counter at offset 0 is the divisor
+	testq	%rcx, %rcx
+	je	.LBB192_45                      # zero divisor: the printed value is 0
+# %bb.46:                               # %if.then91
+                                        #   in Loop: Header=BB192_28 Depth=2
+	imulq	$100, (%r12), %rax              # dividend = 100 * entry counter at offset 0
+	movq	%rax, %rdx
+	orq	%rcx, %rdx
+	shrq	$32, %rdx
+	je	.LBB192_47                      # both operands fit in 32 bits: use the 32-bit divide
+# %bb.48:                               #   in Loop: Header=BB192_28 Depth=2
+	xorl	%edx, %edx
+	divq	%rcx
+	movq	%rax, %r12
+	jmp	.LBB192_49
+	.p2align	4, 0x90
+.LBB192_45:                             #   in Loop: Header=BB192_28 Depth=2
+	xorl	%r12d, %r12d
+	jmp	.LBB192_49
+	.p2align	4, 0x90
+.LBB192_47:                             #   in Loop: Header=BB192_28 Depth=2
+                                        # kill: def $eax killed $eax killed $rax
+	xorl	%edx, %edx
+	divl	%ecx
+	movl	%eax, %r12d
+.LBB192_49:                             # %if.end97
+                                        #   in Loop: Header=BB192_28 Depth=2
+	movq	%r15, %rsi
+	leaq	.L.str.22.181(%rip), %rdx
+	callq	halide_string_to_string@PLT
+	movslq	%r12d, %rdx                     # quotient truncated to 32 bits and sign-extended for printing
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	leaq	.L.str.23.182(%rip), %rdx
+	callq	halide_string_to_string@PLT
+	movq	%rax, %rcx
+	subq	%rbx, %rcx
+	cmpq	$42, %rcx
+	ja	.LBB192_51                      # already 43 or more characters: no padding needed
+	.p2align	4, 0x90
+.LBB192_50:                             # %while.body105
+                                        #   Parent Loop BB192_5 Depth=1
+                                        #     Parent Loop BB192_28 Depth=2
+                                        # =>    This Inner Loop Header: Depth=3
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movq	%r13, %rdx
+	callq	halide_string_to_string@PLT     # append the padding string
+	movq	%rax, %rcx
+	subq	%rbx, %rcx
+	cmpq	$43, %rcx
+	jb	.LBB192_50                      # repeat until the line is at least 43 characters long
+.LBB192_51:                             # %while.end107
+                                        #   in Loop: Header=BB192_28 Depth=2
+	movl	$58, %r12d                      # column target used later when the ratio column is omitted
+	cmpb	$0, -49(%rbp)                   # 1-byte Folded Reload
+	jne	.LBB192_57                      # node fields 32 and 40 were equal: omit the ratio column
+# %bb.52:                               # %if.then109
+                                        #   in Loop: Header=BB192_28 Depth=2
+	movq	-64(%rbp), %rcx                 # 8-byte Reload
+	movq	-72(%rbp), %rdx                 # 8-byte Reload
+	vmovsd	40(%rcx,%rdx,8), %xmm0          # xmm0 = mem[0],zero
+	vmovapd	.LCPI192_1(%rip), %xmm2         # xmm2 = [1127219200,1160773632,0,0]
+	vunpcklps	%xmm2, %xmm0, %xmm0     # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+	vmovapd	.LCPI192_2(%rip), %xmm3         # xmm3 = [4.503599627370496E+15,1.9342813113834067E+25]
+	vsubpd	%xmm3, %xmm0, %xmm0
+	vpermilpd	$1, %xmm0, %xmm1        # xmm1 = xmm0[1,0]
+	vaddsd	%xmm0, %xmm1, %xmm0             # xmm0 = entry field at offset 40 as a double
+	vmovsd	48(%rcx,%rdx,8), %xmm1          # xmm1 = mem[0],zero
+	vunpcklps	%xmm2, %xmm1, %xmm1     # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+	vsubpd	%xmm3, %xmm1, %xmm1
+	vpermilpd	$1, %xmm1, %xmm2        # xmm2 = xmm1[1,0]
+	vaddsd	%xmm1, %xmm2, %xmm1             # xmm1 = entry field at offset 48 as a double
+	vaddsd	.LCPI192_3(%rip), %xmm1, %xmm1  # plus 1.0E-10
+	vdivsd	%xmm1, %xmm0, %xmm0             # ratio = entry field 40 / (entry field 48 + 1.0E-10)
+	vcvtsd2ss	%xmm0, %xmm0, %xmm0
+	vmovss	%xmm0, -88(%rbp)                # 4-byte Spill
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	leaq	.L.str.24.183(%rip), %rdx
+	callq	halide_string_to_string@PLT
+	vmovss	-88(%rbp), %xmm0                # 4-byte Reload
+                                        # xmm0 = mem[0],zero,zero,zero
+	vcvtss2sd	%xmm0, %xmm0, %xmm0
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	xorl	%edx, %edx
+	callq	halide_double_to_string@PLT
+	testq	%rax, %rax
+	je	.LBB192_53
+# %bb.54:                               # %if.then.i397
+                                        #   in Loop: Header=BB192_28 Depth=2
+	addq	$-3, %rax                       # drop the last 3 characters just written
+	cmpq	%rbx, %rax
+	cmovbq	%rbx, %rax                      # but never move before the buffer start
+	movb	$0, (%rax)
+	jmp	.LBB192_55
+.LBB192_53:                             #   in Loop: Header=BB192_28 Depth=2
+	xorl	%eax, %eax
+.LBB192_55:                             # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE2ELy1024EE5eraseEi.exit398
+                                        #   in Loop: Header=BB192_28 Depth=2
+	movq	-64(%rbp), %rdx                 # 8-byte Reload
+	movq	-72(%rbp), %rsi                 # 8-byte Reload
+	movq	%rax, %rcx
+	subq	%rbx, %rcx
+	movl	$73, %r12d                      # column target used later when the ratio column was printed
+	cmpq	$57, %rcx
+	ja	.LBB192_58                      # already 58 or more characters: no padding needed
+	.p2align	4, 0x90
+.LBB192_56:                             # %while.body124
+                                        #   Parent Loop BB192_5 Depth=1
+                                        #     Parent Loop BB192_28 Depth=2
+                                        # =>    This Inner Loop Header: Depth=3
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movq	%r13, %rdx
+	callq	halide_string_to_string@PLT     # append the padding string
+	movq	%rax, %rcx
+	subq	%rbx, %rcx
+	cmpq	$58, %rcx
+	jb	.LBB192_56                      # repeat until the line is at least 58 characters long
+.LBB192_57:                             #   in Loop: Header=BB192_28 Depth=2
+	movq	-64(%rbp), %rdx                 # 8-byte Reload
+	movq	-72(%rbp), %rsi                 # 8-byte Reload
+.LBB192_58:                             # %if.end127
+                                        #   in Loop: Header=BB192_28 Depth=2
+	cmpq	$0, 16(%rdx,%rsi,8)             # entry field at offset 16; zero skips the next group of columns
+	je	.LBB192_71
+# %bb.59:                               # %if.then130
+                                        #   in Loop: Header=BB192_28 Depth=2
+	leaq	(%rdx,%rsi,8), %r14
+	addq	$16, %r14                       # r14 = address of entry field 16
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	leaq	.L.str.25.184(%rip), %rdx
+	callq	halide_string_to_string@PLT
+	movq	(%r14), %rdx                    # value of entry field 16, printed unsigned
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movl	$1, %ecx
+	callq	halide_uint64_to_string@PLT
+	.p2align	4, 0x90
+.LBB192_61:                             # %while.body138
+                                        #   Parent Loop BB192_5 Depth=1
+                                        #     Parent Loop BB192_28 Depth=2
+                                        # =>    This Inner Loop Header: Depth=3
+	movq	%rax, %rcx
+	subq	%rbx, %rcx
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	cmpq	%r12, %rcx
+	jae	.LBB192_62                      # stop padding once the line reaches the column target in r12
+# %bb.60:                               # %while.body138
+                                        #   in Loop: Header=BB192_61 Depth=3
+	movq	%r13, %rdx
+	callq	halide_string_to_string@PLT     # append the padding string
+	jmp	.LBB192_61
+	.p2align	4, 0x90
+.LBB192_62:                             # %while.end140
+                                        #   in Loop: Header=BB192_28 Depth=2
+	leaq	.L.str.26.185(%rip), %rdx
+	callq	halide_string_to_string@PLT
+	movq	-64(%rbp), %rcx                 # 8-byte Reload
+	movq	-72(%rbp), %rdx                 # 8-byte Reload
+	movslq	64(%rcx,%rdx,8), %rdx           # 32-bit entry field at offset 64
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	movq	%rax, %rdi
+	addq	$15, %r12                       # the next column target is 15 characters further
+	subq	%rbx, %rax
+	cmpq	%r12, %rax
+	jae	.LBB192_65
+	.p2align	4, 0x90
+.LBB192_63:                             # %while.body148
+                                        #   Parent Loop BB192_5 Depth=1
+                                        #     Parent Loop BB192_28 Depth=2
+                                        # =>    This Inner Loop Header: Depth=3
+	movq	%r15, %rsi
+	movq	%r13, %rdx
+	callq	halide_string_to_string@PLT     # append the padding string
+	movq	%rax, %rdi
+	subq	%rbx, %rax
+	cmpq	%r12, %rax
+	jb	.LBB192_63                      # repeat until the line reaches the column target in r12
+.LBB192_65:                             # %while.end150
+                                        #   in Loop: Header=BB192_28 Depth=2
+	movq	-64(%rbp), %rdx                 # 8-byte Reload
+	movq	-72(%rbp), %rsi                 # 8-byte Reload
+	leaq	(%rdx,%rsi,8), %rax
+	addq	$64, %rax
+	movslq	(%rax), %rcx                    # entry field at offset 64 is the divisor
+	testq	%rcx, %rcx
+	je	.LBB192_66                      # zero divisor: the printed value is 0
+# %bb.67:                               # %if.then153
+                                        #   in Loop: Header=BB192_28 Depth=2
+	movq	24(%rdx,%rsi,8), %rax           # dividend: 64-bit entry field at offset 24
+	movq	%rax, %rdx
+	orq	%rcx, %rdx
+	shrq	$32, %rdx
+	je	.LBB192_68                      # both operands fit in 32 bits: use the 32-bit divide
+# %bb.69:                               #   in Loop: Header=BB192_28 Depth=2
+	xorl	%edx, %edx
+	divq	%rcx
+	movq	%rax, %r12
+	jmp	.LBB192_70
+	.p2align	4, 0x90
+.LBB192_66:                             #   in Loop: Header=BB192_28 Depth=2
+	xorl	%r12d, %r12d
+	jmp	.LBB192_70
+.LBB192_68:                             #   in Loop: Header=BB192_28 Depth=2
+                                        # kill: def $eax killed $eax killed $rax
+	xorl	%edx, %edx
+	divl	%ecx
+	movl	%eax, %r12d
+.LBB192_70:                             # %if.end159
+                                        #   in Loop: Header=BB192_28 Depth=2
+	movq	%r15, %rsi
+	leaq	.L.str.27.186(%rip), %rdx
+	callq	halide_string_to_string@PLT
+	movslq	%r12d, %rdx                     # quotient truncated to 32 bits and sign-extended for printing
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movl	$1, %ecx
+	callq	halide_int64_to_string@PLT
+	movq	-64(%rbp), %rdx                 # 8-byte Reload
+	movq	-72(%rbp), %rsi                 # 8-byte Reload
+.LBB192_71:                             # %if.end162
+                                        #   in Loop: Header=BB192_28 Depth=2
+	cmpq	$0, 32(%rdx,%rsi,8)             # entry field at offset 32 is printed only when nonzero
+	je	.LBB192_73
+# %bb.72:                               # %if.then165
+                                        #   in Loop: Header=BB192_28 Depth=2
+	leaq	(%rdx,%rsi,8), %r14
+	addq	$32, %r14                       # r14 = address of entry field 32
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	leaq	.L.str.28.187(%rip), %rdx
+	callq	halide_string_to_string@PLT
+	movq	(%r14), %rdx                    # value of entry field 32, printed unsigned
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	movl	$1, %ecx
+	callq	halide_uint64_to_string@PLT
+.LBB192_73:                             # %if.end169
+                                        #   in Loop: Header=BB192_28 Depth=2
+	movq	%rax, %rdi
+	movq	%r15, %rsi
+	leaq	.L.str.7.166(%rip), %rdx
+	callq	halide_string_to_string@PLT
+	movq	%rax, %r12                      # r12 = end of the text just written
+	leaq	.L.str.29.165(%rip), %rsi       # fallback string printed when there is no buffer
+	testq	%rbx, %rbx
+	movq	-80(%rbp), %r14                 # 8-byte Reload
+	je	.LBB192_75
+# %bb.74:                               # %if.then.i454
+                                        #   in Loop: Header=BB192_28 Depth=2
+	movq	-104(%rbp), %rax                # 8-byte Reload
+	leaq	(%rax,%r12), %rdx               # length including the terminator: end - start + 1
+	movq	%r14, %rdi
+	movq	%rbx, %rsi
+	callq	halide_msan_annotate_memory_is_initialized@PLT
+	movq	%rbx, %rsi
+	jmp	.LBB192_75
+.LBB192_22:                             # %for.cond41.preheader
+                                        #   in Loop: Header=BB192_5 Depth=1
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movl	72(%rax), %ecx                  # entry count
+	testl	%ecx, %ecx
+	jle	.LBB192_77                      # no entries: nothing more to print for this node
+# %bb.23:                               # %for.body44.lr.ph
+                                        #   in Loop: Header=BB192_5 Depth=1
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movq	56(%rax), %rax                  # entry array base
+	shlq	$3, %rcx
+	leaq	(%rcx,%rcx,8), %rcx             # rcx = 72 * count, the byte size of the array
+	xorl	%edx, %edx                      # byte offset of the entry being examined
+	.p2align	4, 0x90
+.LBB192_25:                             # %for.body44
+                                        #   Parent Loop BB192_5 Depth=1
+                                        # =>  This Inner Loop Header: Depth=2
+	cmpq	$0, 32(%rax,%rdx)               # any entry with a nonzero field at offset 32 triggers the per-entry lines
+	jne	.LBB192_26
+# %bb.24:                               # %for.cond41
+                                        #   in Loop: Header=BB192_25 Depth=2
+	addq	$72, %rdx                       # next entry
+	cmpq	%rdx, %rcx
+	jne	.LBB192_25
+	jmp	.LBB192_77
+.LBB192_8:                              # %for.cond.cleanup
+	testq	%rbx, %rbx
+	je	.LBB192_9
+# %bb.10:                               # %if.else.i
+	subq	%rbx, %r12
+	incq	%r12                            # length of the last text written plus its terminator
+	movq	-80(%rbp), %rdi                 # 8-byte Reload
+	movq	%rbx, %rsi
+	movq	%r12, %rdx
+	callq	halide_msan_annotate_memory_is_initialized@PLT
+	jmp	.LBB192_11
+.LBB192_9:                              # %if.then.i
+	leaq	.L.str.29.165(%rip), %rsi
+	movq	-80(%rbp), %rdi                 # 8-byte Reload
+	callq	halide_error@PLT                # the buffer was never allocated: report the fallback string as an error
+.LBB192_11:                             # %_ZN6Halide7Runtime8Internal12_GLOBAL__N_17PrinterILNS1_11PrinterTypeE2ELy1024EED2Ev.exit
+	movq	%rbx, %rdi                      # argument for the tail call: the buffer (possibly NULL)
+	addq	$72, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	jmp	free@PLT                        # TAILCALL
+.Lfunc_end192:
+	.size	halide_profiler_report_unlocked, .Lfunc_end192-halide_profiler_report_unlocked
+                                        # -- End function
+	.section	.text.halide_profiler_reset_unlocked,"ax",@progbits    # Frees every node of the list at 24(argument) together with the array each node owns, then zeroes the field at offset 12.
+	.weak	halide_profiler_reset_unlocked  # -- Begin function halide_profiler_reset_unlocked
+	.p2align	4, 0x90
+	.type	halide_profiler_reset_unlocked,@function
+halide_profiler_reset_unlocked:         # @halide_profiler_reset_unlocked
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r14
+	pushq	%rbx
+	movq	%rdi, %rbx                      # rbx = the argument (state pointer)
+	movq	24(%rdi), %r14                  # r14 = head of the node list
+	testq	%r14, %r14
+	je	.LBB193_3                       # empty list: only the final store runs
+	.p2align	4, 0x90
+.LBB193_1:                              # %while.body
+                                        # =>This Inner Loop Header: Depth=1
+	movq	64(%r14), %rax                  # next link of the current head
+	movq	%rax, 24(%rbx)                  # unlink the head before freeing it
+	movq	56(%r14), %rdi                  # pointer at node offset 56: the array owned by the node
+	callq	free@PLT
+	movq	%r14, %rdi
+	callq	free@PLT                        # then free the node itself
+	movq	24(%rbx), %r14                  # reload the new head
+	testq	%r14, %r14
+	jne	.LBB193_1
+.LBB193_3:                              # %while.end
+	movl	$0, 12(%rbx)                    # zero the 32-bit field at offset 12
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	retq
+.Lfunc_end193:
+	.size	halide_profiler_reset_unlocked, .Lfunc_end193-halide_profiler_reset_unlocked
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal23find_or_create_pipelineEPKciPKy,"ax",@progbits
+	.weak	_ZN6Halide7Runtime8Internal23find_or_create_pipelineEPKciPKy # -- Begin function _ZN6Halide7Runtime8Internal23find_or_create_pipelineEPKciPKy
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal23find_or_create_pipelineEPKciPKy,@function
+_ZN6Halide7Runtime8Internal23find_or_create_pipelineEPKciPKy: # @_ZN6Halide7Runtime8Internal23find_or_create_pipelineEPKciPKy
+# %bb.0:                                # %entry -- returns the list node matching (rdi pointer, esi count), or allocates, initialises and prepends a new one; returns 0 if malloc fails
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	pushq	%rax
+	movq	%rdx, %r15	# r15 = third argument: array of 8-byte values, one per entry
+	movl	%esi, %r14d	# r14d = second argument: entry count
+	movq	%rdi, %r13	# r13 = first argument: const char pointer (per the mangled signature)
+	callq	halide_profiler_get_state@PLT
+	movq	%rax, %r12	# r12 = profiler state
+	movq	24(%rax), %rbx	# rbx = head of the node list
+	jmp	.LBB194_1
+	.p2align	4, 0x90
+.LBB194_4:                              # %for.inc
+                                        #   in Loop: Header=BB194_1 Depth=1
+	movq	64(%rbx), %rbx	# advance to the next node
+.LBB194_1:                              # %entry
+                                        # =>This Inner Loop Header: Depth=1
+	testq	%rbx, %rbx
+	je	.LBB194_5	# end of list without a match: create a node
+# %bb.2:                                # %for.body
+                                        #   in Loop: Header=BB194_1 Depth=1
+	cmpq	%r13, 48(%rbx)	# pointer identity compare, not a string compare
+	jne	.LBB194_4
+# %bb.3:                                # %land.lhs.true
+                                        #   in Loop: Header=BB194_1 Depth=1
+	cmpl	%r14d, 72(%rbx)	# the stored entry count must match as well
+	jne	.LBB194_4
+	jmp	.LBB194_13	# match: return the existing node
+.LBB194_5:                              # %for.end
+	movl	$96, %edi
+	callq	malloc@PLT	# allocate a 96-byte node
+	testq	%rax, %rax
+	je	.LBB194_12	# allocation failed: return 0
+# %bb.6:                                # %if.end7
+	movq	%rax, %rbx
+	movq	24(%r12), %rax
+	movq	%rax, 64(%rbx)	# new node's next = current list head
+	movq	%r13, 48(%rbx)	# store the first-argument pointer
+	movl	12(%r12), %eax
+	movl	%eax, 76(%rbx)	# first id of this node = state counter at offset 12
+	movl	%r14d, 72(%rbx)	# store the entry count
+	movq	$0, 80(%rbx)	# zero bytes 80..87
+	vxorps	%xmm0, %xmm0, %xmm0
+	vmovups	%ymm0, (%rbx)	# zero bytes 0..31
+	movl	$0, 88(%rbx)	# zero bytes 88..91
+	vxorps	%xmm0, %xmm0, %xmm0
+	vmovups	%xmm0, 32(%rbx)	# zero bytes 32..47
+	movslq	%r14d, %rax
+	shlq	$3, %rax
+	leaq	(%rax,%rax,8), %rdi	# rdi = count * 72 bytes
+	vzeroupper
+	callq	malloc@PLT	# allocate the per-entry array (72 bytes each)
+	movq	%rax, 56(%rbx)
+	testq	%rax, %rax
+	je	.LBB194_11	# array allocation failed: free the node and return 0
+# %bb.7:                                # %for.cond17.preheader
+	testl	%r14d, %r14d
+	jle	.LBB194_10	# count is zero or negative: nothing to initialise
+# %bb.8:                                # %for.body20.preheader
+	movl	%r14d, %ecx
+	addq	$64, %rax	# rax is biased by +64, so the offsets below are relative to entry+64
+	xorl	%edx, %edx
+	vxorps	%xmm0, %xmm0, %xmm0
+	vxorps	%xmm1, %xmm1, %xmm1
+	.p2align	4, 0x90
+.LBB194_9:                              # %for.body20
+                                        # =>This Inner Loop Header: Depth=1
+	movq	$0, -64(%rax)	# entry bytes 0..7 = 0
+	movq	(%r15,%rdx,8), %rsi
+	movq	%rsi, -8(%rax)	# entry offset 56 = value i of the third-argument array
+	movl	$0, (%rax)	# entry bytes 64..67 = 0
+	vmovups	%ymm0, -56(%rax)	# entry bytes 8..39 = 0
+	vmovups	%xmm1, -24(%rax)	# entry bytes 40..55 = 0
+	incq	%rdx
+	addq	$72, %rax	# step to the next 72-byte entry
+	cmpq	%rdx, %rcx
+	jne	.LBB194_9
+.LBB194_10:                             # %for.cond.cleanup19
+	addl	%r14d, 12(%r12)	# advance the state counter by the number of ids consumed
+	movq	%rbx, 24(%r12)	# publish the new node as the list head
+	jmp	.LBB194_13
+.LBB194_11:                             # %if.then15
+	movq	%rbx, %rdi
+	callq	free@PLT	# release the half-built node
+.LBB194_12:                             # %cleanup62
+	xorl	%ebx, %ebx	# failure result: null pointer
+.LBB194_13:                             # %cleanup62
+	movq	%rbx, %rax	# return value: node pointer or 0
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	vzeroupper
+	retq
+.Lfunc_end194:
+	.size	_ZN6Halide7Runtime8Internal23find_or_create_pipelineEPKciPKy, .Lfunc_end194-_ZN6Halide7Runtime8Internal23find_or_create_pipelineEPKciPKy
+                                        # -- End function
+	.section	.rodata.cst16,"aM",@progbits,16
+	.p2align	4, 0x0                          # -- Begin function _ZN6Halide7Runtime8Internal9bill_funcEP21halide_profiler_stateiyi
+.LCPI195_0:
+	.zero	8	# low qword: placeholder, overwritten at run time with the sign-extended fourth argument
+	.quad	1                               # 0x1
+	.section	.text._ZN6Halide7Runtime8Internal9bill_funcEP21halide_profiler_stateiyi,"ax",@progbits
+	.weak	_ZN6Halide7Runtime8Internal9bill_funcEP21halide_profiler_stateiyi
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal9bill_funcEP21halide_profiler_stateiyi,@function
+_ZN6Halide7Runtime8Internal9bill_funcEP21halide_profiler_stateiyi: # @_ZN6Halide7Runtime8Internal9bill_funcEP21halide_profiler_stateiyi
+# %bb.0:                                # %entry -- finds the node whose id range contains esi, moves it to the list front, and adds rdx and (ecx, 1) to the counters of that entry and of the node
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	24(%rdi), %r8	# r8 = list head (rdi = profiler state)
+	testq	%r8, %r8
+	je	.LBB195_8	# empty list: nothing to bill
+# %bb.1:                                # %for.body.preheader
+	xorl	%r10d, %r10d	# r10 = previous node, null while at the head
+	movq	%r8, %r9
+	jmp	.LBB195_3
+	.p2align	4, 0x90
+.LBB195_2:                              # %if.end23
+                                        #   in Loop: Header=BB195_3 Depth=1
+	movq	64(%rax), %r9	# r9 = next node
+	movq	%rax, %r10	# remember the current node as previous
+	testq	%r9, %r9
+	je	.LBB195_8	# ran off the list without a match: return
+.LBB195_3:                              # %for.body
+                                        # =>This Inner Loop Header: Depth=1
+	movq	%r9, %rax	# rax = current node
+	movslq	76(%r9), %r9	# r9 = first id of this node, sign-extended
+	cmpl	%esi, %r9d
+	jg	.LBB195_2	# id is below this node's range
+# %bb.4:                                # %land.lhs.true
+                                        #   in Loop: Header=BB195_3 Depth=1
+	movl	72(%rax), %r11d
+	addl	%r9d, %r11d	# r11d = first id + entry count (one past the last id)
+	cmpl	%esi, %r11d
+	jle	.LBB195_2	# id is at or above the end of this node's range
+# %bb.5:                                # %if.then
+	testq	%r10, %r10
+	je	.LBB195_7	# node is already at the head: no relinking needed
+# %bb.6:                                # %if.then4
+	movq	64(%rax), %r11
+	movq	%r11, 64(%r10)	# previous.next = node.next (unlink)
+	movq	%r8, 64(%rax)	# node.next = old head
+	movq	%rax, 24(%rdi)	# list head = node (move to front)
+.LBB195_7:                              # %if.end
+	movslq	%esi, %rsi
+	leaq	(%rsi,%rsi,8), %rsi
+	shlq	$3, %rsi	# rsi = id * 72
+	addq	56(%rax), %rsi	# rsi = entry array base + id * 72
+	negq	%r9
+	leaq	(%r9,%r9,8), %rdi	# rdi = -first_id * 9, so (rsi + rdi*8) addresses entry (id - first_id)
+	addq	%rdx, (%rsi,%rdi,8)	# entry qword at offset 0 += third argument
+	movslq	%ecx, %rcx
+	vmovdqa	.LCPI195_0(%rip), %xmm0         # xmm0 = <u,1>
+	vpinsrq	$0, %rcx, %xmm0, %xmm0	# xmm0 = (sign-extended fourth argument, 1)
+	vpaddq	40(%rsi,%rdi,8), %xmm0, %xmm1
+	vmovdqu	%xmm1, 40(%rsi,%rdi,8)	# entry qwords at offsets 40 and 48 += (fourth argument, 1)
+	addq	%rdx, (%rax)	# node qword at offset 0 += third argument
+	incl	84(%rax)	# bump the node's 32-bit counter at offset 84
+	vpaddq	32(%rax), %xmm0, %xmm0
+	vmovdqu	%xmm0, 32(%rax)	# node qwords at offsets 32 and 40 += (fourth argument, 1)
+.LBB195_8:                              # %cleanup
+	popq	%rbp
+	retq
+.Lfunc_end195:
+	.size	_ZN6Halide7Runtime8Internal9bill_funcEP21halide_profiler_stateiyi, .Lfunc_end195-_ZN6Halide7Runtime8Internal9bill_funcEP21halide_profiler_stateiyi
+                                        # -- End function
+	.section	.text.halide_profiler_sample,"ax",@progbits
+	.weak	halide_profiler_sample          # -- Begin function halide_profiler_sample
+	.p2align	4, 0x90
+	.type	halide_profiler_sample,@function
+halide_profiler_sample:                 # @halide_profiler_sample
+# %bb.0:                                # %entry -- reads the current id, bills the time since *rsi to it via bill_func, updates *rsi, and returns 8(%rdi), or -1 when the id is -2
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%rbx
+	pushq	%rax
+	movq	%rsi, %r14	# r14 = second argument: pointer to the previous timestamp
+	movq	%rdi, %rbx	# rbx = first argument: profiler state
+	movq	32(%rdi), %rax	# optional callback pointer stored at offset 32
+	testq	%rax, %rax
+	je	.LBB196_2	# no callback: read the two values from the state directly
+# %bb.1:                                # %if.then
+	leaq	-32(%rbp), %rdi
+	leaq	-28(%rbp), %rsi
+	callq	*%rax	# callback receives pointers to two 32-bit locals (presumably fills in both -- confirm)
+	jmp	.LBB196_3
+.LBB196_2:                              # %if.else
+	movl	16(%rbx), %eax
+	movl	%eax, -32(%rbp)	# local id = dword at state offset 16
+	movl	20(%rbx), %eax
+	movl	%eax, -28(%rbp)	# second local = dword at state offset 20
+.LBB196_3:                              # %if.end
+	xorl	%edi, %edi
+	callq	halide_current_time_ns@PLT
+	movq	%rax, %r15	# r15 = current time
+	movl	-32(%rbp), %esi
+	movl	$-1, %eax	# default return value, used when the id is -2
+	cmpl	$-2, %esi
+	je	.LBB196_7	# id of -2: return -1 without billing or updating *r14
+# %bb.4:                                # %if.else4
+	testl	%esi, %esi
+	js	.LBB196_6	# other negative ids are not billed
+# %bb.5:                                # %if.then6
+	movq	%r15, %rdx
+	subq	(%r14), %rdx	# elapsed = now - previous timestamp
+	movl	-28(%rbp), %ecx
+	movq	%rbx, %rdi
+	callq	_ZN6Halide7Runtime8Internal9bill_funcEP21halide_profiler_stateiyi@PLT
+.LBB196_6:                              # %if.end8
+	movq	%r15, (%r14)	# previous timestamp = now
+	movl	8(%rbx), %eax	# return the dword at state offset 8 (sampling_profiler_thread passes it to halide_sleep_ms)
+.LBB196_7:                              # %cleanup
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end196:
+	.size	halide_profiler_sample, .Lfunc_end196-halide_profiler_sample
+                                        # -- End function
+	.section	.text._ZN6Halide7Runtime8Internal24sampling_profiler_threadEPv,"ax",@progbits
+	.weak	_ZN6Halide7Runtime8Internal24sampling_profiler_threadEPv # -- Begin function _ZN6Halide7Runtime8Internal24sampling_profiler_threadEPv
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal24sampling_profiler_threadEPv,@function
+_ZN6Halide7Runtime8Internal24sampling_profiler_threadEPv: # @_ZN6Halide7Runtime8Internal24sampling_profiler_threadEPv
+# %bb.0:                                # %entry -- sampling loop: calls halide_profiler_sample and sleeps for the returned interval until the dword at state offset 16 is -2
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%rbx
+	pushq	%rax
+	callq	halide_profiler_get_state@PLT
+	movq	%rax, %rbx	# rbx = profiler state
+	movq	%rax, %rdi
+	callq	halide_mutex_lock@PLT	# the state pointer itself is passed as the mutex
+	cmpl	$-2, 16(%rbx)
+	jne	.LBB197_1	# not -2 yet: enter the sampling loop
+.LBB197_6:                              # %while.end8
+	movq	%rbx, %rdi
+	callq	halide_mutex_unlock@PLT	# release the lock before returning
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.LBB197_1:                              # %while.body.preheader
+	leaq	-32(%rbp), %r14	# r14 = address of the local timestamp handed to halide_profiler_sample
+	jmp	.LBB197_2
+	.p2align	4, 0x90
+.LBB197_5:                              # %while.end
+                                        #   in Loop: Header=BB197_2 Depth=1
+	cmpl	$-2, 16(%rbx)
+	je	.LBB197_6	# value became -2: leave the outer loop
+.LBB197_2:                              # %while.body
+                                        # =>This Loop Header: Depth=1
+                                        #     Child Loop BB197_4 Depth 2
+	xorl	%edi, %edi
+	callq	halide_current_time_ns@PLT
+	movq	%rax, -32(%rbp)	# restart the timestamp at the current time
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	callq	halide_profiler_sample@PLT
+	testl	%eax, %eax
+	js	.LBB197_5	# negative result: re-check the outer loop condition
+# %bb.3:                                # %cleanup.preheader
+                                        #   in Loop: Header=BB197_2 Depth=1
+	movl	%eax, %r15d	# r15d = sleep interval returned by the sample call
+	.p2align	4, 0x90
+.LBB197_4:                              # %cleanup
+                                        #   Parent Loop BB197_2 Depth=1
+                                        # =>  This Inner Loop Header: Depth=2
+	movq	%rbx, %rdi
+	callq	halide_mutex_unlock@PLT	# drop the lock while sleeping
+	xorl	%edi, %edi
+	movl	%r15d, %esi
+	callq	halide_sleep_ms@PLT
+	movq	%rbx, %rdi
+	callq	halide_mutex_lock@PLT	# re-acquire the lock before sampling again
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	callq	halide_profiler_sample@PLT
+	movl	%eax, %r15d
+	testl	%eax, %eax
+	jns	.LBB197_4	# non-negative result: sleep and sample again
+	jmp	.LBB197_5
+.Lfunc_end197:
+	.size	_ZN6Halide7Runtime8Internal24sampling_profiler_threadEPv, .Lfunc_end197-_ZN6Halide7Runtime8Internal24sampling_profiler_threadEPv
+                                        # -- End function
+	.section	.text.halide_profiler_get_pipeline_state,"ax",@progbits
+	.weak	halide_profiler_get_pipeline_state # -- Begin function halide_profiler_get_pipeline_state
+	.p2align	4, 0x90
+	.type	halide_profiler_get_pipeline_state,@function
+halide_profiler_get_pipeline_state:     # @halide_profiler_get_pipeline_state
+# %bb.0:                                # %entry -- under the state lock, returns the first list node whose pointer at offset 48 equals rdi, or 0 if none
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%rbx
+	pushq	%rax
+	movq	%rdi, %r15	# r15 = pointer to look for
+	callq	halide_profiler_get_state@PLT
+	movq	%rax, %rbx
+	movq	%rax, %rdi
+	callq	halide_mutex_lock@PLT
+	movq	24(%rbx), %r14	# r14 = list head
+	testq	%r14, %r14
+	je	.LBB198_4
+	.p2align	4, 0x90
+.LBB198_2:                              # %for.body
+                                        # =>This Inner Loop Header: Depth=1
+	cmpq	%r15, 48(%r14)	# pointer identity compare, not a string compare
+	je	.LBB198_5	# found: r14 is the result
+# %bb.3:                                # %for.inc
+                                        #   in Loop: Header=BB198_2 Depth=1
+	movq	64(%r14), %r14	# follow the next pointer
+	testq	%r14, %r14
+	jne	.LBB198_2
+.LBB198_4:
+	xorl	%r14d, %r14d	# not found: result is null
+.LBB198_5:                              # %cleanup
+	movq	%rbx, %rdi
+	callq	halide_mutex_unlock@PLT
+	movq	%r14, %rax	# return the node pointer or 0
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end198:
+	.size	halide_profiler_get_pipeline_state, .Lfunc_end198-halide_profiler_get_pipeline_state
+                                        # -- End function
+	.section	.text.halide_profiler_pipeline_start,"ax",@progbits
+	.weak	halide_profiler_pipeline_start  # -- Begin function halide_profiler_pipeline_start
+	.p2align	4, 0x90
+	.type	halide_profiler_pipeline_start,@function
+halide_profiler_pipeline_start:         # @halide_profiler_pipeline_start
+# %bb.0:                                # %entry -- under the state lock: spawns the sampling thread on first use, finds or creates the pipeline node, and returns its first id (or the out-of-memory error code)
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	pushq	%rax
+	movq	%rcx, %r15	# r15 = fourth argument, forwarded as the 8-byte value array
+	movl	%edx, %r12d	# r12d = third argument, forwarded as the entry count
+	movq	%rsi, %r13	# r13 = second argument, forwarded as the lookup pointer
+	movq	%rdi, %r14	# r14 = first argument, passed on to halide_start_clock and the error handler
+	callq	halide_profiler_get_state@PLT
+	movq	%rax, %rbx
+	movq	%rax, %rdi
+	callq	halide_mutex_lock@PLT
+	cmpq	$0, 40(%rbx)
+	jne	.LBB199_2	# thread handle at offset 40 already set: skip the spawn
+# %bb.1:                                # %if.then
+	movq	%r14, %rdi
+	callq	halide_start_clock@PLT
+	movq	_ZN6Halide7Runtime8Internal24sampling_profiler_threadEPv@GOTPCREL(%rip), %rdi
+	xorl	%esi, %esi
+	callq	halide_spawn_thread@PLT	# start sampling_profiler_thread with a null argument
+	movq	%rax, 40(%rbx)	# remember the returned handle
+.LBB199_2:                              # %if.end
+	movq	%r13, %rdi
+	movl	%r12d, %esi
+	movq	%r15, %rdx
+	callq	_ZN6Halide7Runtime8Internal23find_or_create_pipelineEPKciPKy@PLT
+	testq	%rax, %rax
+	je	.LBB199_3	# null means allocation failed
+# %bb.4:                                # %if.end8
+	incl	80(%rax)	# bump the node's 32-bit counter at offset 80
+	movl	76(%rax), %r14d	# result = the node's first id
+	jmp	.LBB199_5
+.LBB199_3:                              # %if.then6
+	movq	%r14, %rdi
+	callq	halide_error_out_of_memory@PLT
+	movl	%eax, %r14d	# result = error code from the handler
+.LBB199_5:                              # %cleanup
+	movq	%rbx, %rdi
+	callq	halide_mutex_unlock@PLT
+	movl	%r14d, %eax
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end199:
+	.size	halide_profiler_pipeline_start, .Lfunc_end199-halide_profiler_pipeline_start
+                                        # -- End function
+	.section	.text.halide_profiler_stack_peak_update,"ax",@progbits
+	.weak	halide_profiler_stack_peak_update # -- Begin function halide_profiler_stack_peak_update
+	.p2align	4, 0x90
+	.type	halide_profiler_stack_peak_update,@function
+halide_profiler_stack_peak_update:      # @halide_profiler_stack_peak_update
+# %bb.0:                                # %entry -- for each entry i of the node in rsi, atomically raises the qword at entry offset 32 to rdx[i] when rdx[i] is non-zero and larger
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r14
+	pushq	%rbx
+	movq	%rdx, %rbx	# rbx = third argument: array of 8-byte values, one per entry
+	movq	%rsi, %r14	# r14 = second argument: node pointer
+	testq	%rsi, %rsi
+	je	.LBB200_1	# null node: take the print-and-abort path
+# %bb.2:                                # %do.end
+	movl	72(%r14), %edx	# edx = entry count
+	testl	%edx, %edx
+	jg	.LBB200_3
+	jmp	.LBB200_11
+.LBB200_1:                              # %if.then
+	leaq	.L.str.188(%rip), %rsi
+	callq	halide_print@PLT	# rdi still holds the first argument here
+	callq	abort@PLT
+	movl	72(%r14), %edx	# only reached if abort returns
+	testl	%edx, %edx
+	jle	.LBB200_11
+.LBB200_3:                              # %for.body.lr.ph
+	xorl	%ecx, %ecx	# rcx = entry index i
+	jmp	.LBB200_4
+	.p2align	4, 0x90
+.LBB200_9:                              # %for.inc.loopexit
+                                        #   in Loop: Header=BB200_4 Depth=1
+	movl	72(%r14), %edx	# reload the count, since edx was reused as a pointer in the inner loop
+.LBB200_10:                             # %for.inc
+                                        #   in Loop: Header=BB200_4 Depth=1
+	incq	%rcx
+	movslq	%edx, %rax
+	cmpq	%rax, %rcx
+	jge	.LBB200_11
+.LBB200_4:                              # %for.body
+                                        # =>This Loop Header: Depth=1
+                                        #     Child Loop BB200_7 Depth 2
+	movq	(%rbx,%rcx,8), %rsi	# rsi = candidate value for entry i
+	testq	%rsi, %rsi
+	je	.LBB200_10	# zero candidates are skipped
+# %bb.5:                                # %if.then3
+                                        #   in Loop: Header=BB200_4 Depth=1
+	movq	56(%r14), %rdi	# rdi = entry array base
+	leaq	(%rcx,%rcx,8), %r8	# r8 = i * 9, scaled by 8 below for the 72-byte stride
+	movq	32(%rdi,%r8,8), %rax	# rax = current stored value
+	cmpq	%rsi, %rax
+	jae	.LBB200_10	# stored value is already at least the candidate (unsigned compare)
+# %bb.6:                                # %while.body.i.preheader
+                                        #   in Loop: Header=BB200_4 Depth=1
+	leaq	(%rdi,%r8,8), %rdx
+	addq	$32, %rdx	# rdx = address of the field being raised
+	.p2align	4, 0x90
+.LBB200_7:                              # %while.body.i
+                                        #   Parent Loop BB200_4 Depth=1
+                                        # =>  This Inner Loop Header: Depth=2
+	lock		cmpxchgq	%rsi, (%rdx)	# store the candidate if the field still equals rax; on failure rax receives the current value
+	je	.LBB200_9	# exchange succeeded
+# %bb.8:                                # %while.body.i
+                                        #   in Loop: Header=BB200_7 Depth=2
+	cmpq	%rsi, %rax
+	jb	.LBB200_7	# retry while the observed value is still below the candidate
+	jmp	.LBB200_9
+.LBB200_11:                             # %for.cond.cleanup
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	retq
+.Lfunc_end200:
+	.size	halide_profiler_stack_peak_update, .Lfunc_end200-halide_profiler_stack_peak_update
+                                        # -- End function
+	.section	.text.halide_profiler_memory_allocate,"ax",@progbits
+	.weak	halide_profiler_memory_allocate # -- Begin function halide_profiler_memory_allocate
+	.p2align	4, 0x90
+	.type	halide_profiler_memory_allocate,@function
+halide_profiler_memory_allocate:        # @halide_profiler_memory_allocate
+# %bb.0:                                # %entry -- atomically adds rcx to the counters of the node in rsi and of its entry edx, and raises each running maximum; no-op when rcx is 0
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r12
+	pushq	%rbx
+	testq	%rcx, %rcx
+	je	.LBB201_14	# zero amount: return immediately
+# %bb.1:                                # %if.end
+	movq	%rcx, %rbx	# rbx = fourth argument: amount to add
+	movl	%edx, %r15d	# r15d = third argument: entry index
+	movq	%rsi, %r14	# r14 = second argument: node pointer
+	movq	%rdi, %r12	# r12 = first argument, passed to halide_print on the failure paths
+	testq	%rsi, %rsi
+	je	.LBB201_2	# null node: print and abort
+# %bb.3:                                # %do.body4
+	testl	%r15d, %r15d
+	js	.LBB201_4	# negative index: print and abort
+.LBB201_5:                              # %do.body10
+	cmpl	%r15d, 72(%r14)
+	jg	.LBB201_7	# index is below the entry count: proceed
+.LBB201_6:                              # %if.then12
+	leaq	.L.str.3.191(%rip), %rsi
+	movq	%r12, %rdi
+	callq	halide_print@PLT
+	callq	abort@PLT	# execution continues into the update below only if abort returns
+.LBB201_7:                              # %do.end15
+	movq	56(%r14), %rcx	# rcx = entry array base
+	lock		incl	88(%r14)	# node: bump the 32-bit counter at offset 88
+	lock		addq	%rbx, 24(%r14)	# node: qword at offset 24 += amount
+	movq	%rbx, %rdx
+	lock		xaddq	%rdx, 8(%r14)	# node: qword at offset 8 += amount; rdx receives the old value
+	movslq	%r15d, %rsi
+	addq	%rbx, %rdx	# rdx = new value of the offset-8 counter
+	movq	16(%r14), %rax	# rax = current maximum at offset 16
+	.p2align	4, 0x90
+.LBB201_8:                              # %do.end15
+                                        # =>This Inner Loop Header: Depth=1
+	cmpq	%rdx, %rax
+	jae	.LBB201_10	# maximum is already at least the new value
+# %bb.9:                                # %while.body.i
+                                        #   in Loop: Header=BB201_8 Depth=1
+	lock		cmpxchgq	%rdx, 16(%r14)	# try to raise the maximum; on failure rax receives the current value
+	jne	.LBB201_8
+.LBB201_10:                             # %_ZN12_GLOBAL__N_125sync_compare_max_and_swapIyEEvPT_S1_.exit
+	leaq	(%rsi,%rsi,8), %rsi	# rsi = index * 9, scaled by 8 below for the 72-byte stride
+	lock		incl	64(%rcx,%rsi,8)	# entry: bump the 32-bit counter at offset 64
+	lock		addq	%rbx, 24(%rcx,%rsi,8)	# entry: qword at offset 24 += amount
+	movq	%rbx, %rdx
+	lock		xaddq	%rdx, 8(%rcx,%rsi,8)	# entry: qword at offset 8 += amount; rdx receives the old value
+	addq	%rbx, %rdx	# rdx = new value of the entry's offset-8 counter
+	movq	16(%rcx,%rsi,8), %rax	# rax = the entry's current maximum at offset 16
+	cmpq	%rdx, %rax
+	jae	.LBB201_14
+# %bb.11:                               # %while.body.i46.preheader
+	leaq	(%rcx,%rsi,8), %rcx
+	addq	$16, %rcx	# rcx = address of the entry's maximum field
+	.p2align	4, 0x90
+.LBB201_12:                             # %while.body.i46
+                                        # =>This Inner Loop Header: Depth=1
+	lock		cmpxchgq	%rdx, (%rcx)	# same compare-and-raise as above, for the entry
+	je	.LBB201_14
+# %bb.13:                               # %while.body.i46
+                                        #   in Loop: Header=BB201_12 Depth=1
+	cmpq	%rdx, %rax
+	jb	.LBB201_12
+.LBB201_14:                             # %return
+	popq	%rbx
+	popq	%r12
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.LBB201_2:                              # %if.then2
+	leaq	.L.str.1.189(%rip), %rsi
+	movq	%r12, %rdi
+	callq	halide_print@PLT
+	callq	abort@PLT
+	testl	%r15d, %r15d	# only reached if abort returns: resume the remaining checks
+	jns	.LBB201_5
+.LBB201_4:                              # %if.then6
+	leaq	.L.str.2.190(%rip), %rsi
+	movq	%r12, %rdi
+	callq	halide_print@PLT
+	callq	abort@PLT
+	cmpl	%r15d, 72(%r14)	# only reached if abort returns: resume the bounds check
+	jle	.LBB201_6
+	jmp	.LBB201_7
+.Lfunc_end201:
+	.size	halide_profiler_memory_allocate, .Lfunc_end201-halide_profiler_memory_allocate
+                                        # -- End function
+	.section	.text.halide_profiler_memory_free,"ax",@progbits
+	.weak	halide_profiler_memory_free     # -- Begin function halide_profiler_memory_free
+	.p2align	4, 0x90
+	.type	halide_profiler_memory_free,@function
+halide_profiler_memory_free:            # @halide_profiler_memory_free
+# %bb.0:                                # %entry -- atomically subtracts rcx from the qword at offset 8 of the node in rsi and of its entry edx; no-op when rcx is 0
+	testq	%rcx, %rcx
+	je	.LBB202_8	# zero amount: return before any frame is set up
+# %bb.1:                                # %if.end
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r12
+	pushq	%rbx
+	movq	%rcx, %rbx	# rbx = fourth argument: amount to subtract
+	movl	%edx, %r15d	# r15d = third argument: entry index
+	movq	%rsi, %r14	# r14 = second argument: node pointer
+	movq	%rdi, %r12	# r12 = first argument, passed to halide_print on the failure paths
+	testq	%rsi, %rsi
+	je	.LBB202_2	# null node: print and abort
+# %bb.3:                                # %do.body4
+	testl	%r15d, %r15d
+	js	.LBB202_4	# negative index: print and abort
+.LBB202_5:                              # %do.body10
+	cmpl	%r15d, 72(%r14)
+	jg	.LBB202_7	# index is below the entry count: proceed
+.LBB202_6:                              # %if.then12
+	leaq	.L.str.6.194(%rip), %rsi
+	movq	%r12, %rdi
+	callq	halide_print@PLT
+	callq	abort@PLT	# execution continues into the update below only if abort returns
+.LBB202_7:                              # %do.end15
+	movq	56(%r14), %rax	# rax = entry array base
+	movslq	%r15d, %rcx
+	lock		subq	%rbx, 8(%r14)	# node: qword at offset 8 -= amount
+	leaq	(%rcx,%rcx,8), %rcx	# rcx = index * 9, scaled by 8 below for the 72-byte stride
+	lock		subq	%rbx, 8(%rax,%rcx,8)	# entry: qword at offset 8 -= amount
+	popq	%rbx
+	popq	%r12
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+.LBB202_8:                              # %return
+	retq
+.LBB202_2:                              # %if.then2
+	leaq	.L.str.4.192(%rip), %rsi
+	movq	%r12, %rdi
+	callq	halide_print@PLT
+	callq	abort@PLT
+	testl	%r15d, %r15d	# only reached if abort returns: resume the remaining checks
+	jns	.LBB202_5
+.LBB202_4:                              # %if.then6
+	leaq	.L.str.5.193(%rip), %rsi
+	movq	%r12, %rdi
+	callq	halide_print@PLT
+	callq	abort@PLT
+	cmpl	%r15d, 72(%r14)	# only reached if abort returns: resume the bounds check
+	jle	.LBB202_6
+	jmp	.LBB202_7
+.Lfunc_end202:
+	.size	halide_profiler_memory_free, .Lfunc_end202-halide_profiler_memory_free
+                                        # -- End function
+	.section	.text.halide_profiler_report,"ax",@progbits
+	.weak	halide_profiler_report          # -- Begin function halide_profiler_report
+	.p2align	4, 0x90
+	.type	halide_profiler_report,@function
+halide_profiler_report:                 # @halide_profiler_report
+# %bb.0:                                # %entry -- locking wrapper: takes the profiler state lock, calls halide_profiler_report_unlocked(rdi, state), then unlocks
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r14
+	pushq	%rbx
+	movq	%rdi, %rbx	# rbx = first argument, preserved for the unlocked call
+	callq	halide_profiler_get_state@PLT
+	movq	%rax, %r14	# r14 = profiler state
+	movq	%rax, %rdi
+	callq	halide_mutex_lock@PLT
+	movq	%rbx, %rdi
+	movq	%r14, %rsi
+	callq	halide_profiler_report_unlocked@PLT
+	movq	%r14, %rdi	# argument for the tail-called unlock
+	popq	%rbx
+	popq	%r14
+	popq	%rbp
+	jmp	halide_mutex_unlock@PLT         # TAILCALL
+.Lfunc_end203:
+	.size	halide_profiler_report, .Lfunc_end203-halide_profiler_report
+                                        # -- End function
+	.section	.text.halide_profiler_reset,"ax",@progbits
+	.weak	halide_profiler_reset           # -- Begin function halide_profiler_reset
+	.p2align	4, 0x90
+	.type	halide_profiler_reset,@function
+halide_profiler_reset:                  # @halide_profiler_reset
+# %bb.0:                                # %entry -- locking wrapper: takes the profiler state lock, calls halide_profiler_reset_unlocked(state), then unlocks
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%rbx
+	pushq	%rax
+	callq	halide_profiler_get_state@PLT
+	movq	%rax, %rbx	# rbx = profiler state
+	movq	%rax, %rdi
+	callq	halide_mutex_lock@PLT
+	movq	%rbx, %rdi
+	callq	halide_profiler_reset_unlocked@PLT
+	movq	%rbx, %rdi	# argument for the tail-called unlock
+	addq	$8, %rsp
+	popq	%rbx
+	popq	%rbp
+	jmp	halide_mutex_unlock@PLT         # TAILCALL
+.Lfunc_end204:
+	.size	halide_profiler_reset, .Lfunc_end204-halide_profiler_reset
+                                        # -- End function
+	.section	.text.halide_profiler_pipeline_end,"ax",@progbits
+	.weak	halide_profiler_pipeline_end    # -- Begin function halide_profiler_pipeline_end
+	.p2align	4, 0x90
+	.type	halide_profiler_pipeline_end,@function
+halide_profiler_pipeline_end:           # @halide_profiler_pipeline_end
+# %bb.0:                                # %entry -- stores -1 into the dword at offset 16 of the second argument; the first argument is not used
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movl	$-1, 16(%rsi)	# offset 16 is the id field that halide_profiler_sample reads; -1 is negative, so nothing is billed
+	popq	%rbp
+	retq
+.Lfunc_end205:
+	.size	halide_profiler_pipeline_end, .Lfunc_end205-halide_profiler_pipeline_end
+                                        # -- End function
+	.section	.text.halide_msan_annotate_memory_is_initialized,"ax",@progbits
+	.weak	halide_msan_annotate_memory_is_initialized # -- Begin function halide_msan_annotate_memory_is_initialized
+	.p2align	4, 0x90
+	.type	halide_msan_annotate_memory_is_initialized,@function
+halide_msan_annotate_memory_is_initialized: # @halide_msan_annotate_memory_is_initialized
+# %bb.0:                                # %entry -- stub in this build: ignores its arguments and returns 0
+	pushq	%rbp
+	movq	%rsp, %rbp
+	xorl	%eax, %eax	# return value 0
+	popq	%rbp
+	retq
+.Lfunc_end206:
+	.size	halide_msan_annotate_memory_is_initialized, .Lfunc_end206-halide_msan_annotate_memory_is_initialized
+                                        # -- End function
+	.section	.text.halide_msan_check_memory_is_initialized,"ax",@progbits
+	.weak	halide_msan_check_memory_is_initialized # -- Begin function halide_msan_check_memory_is_initialized
+	.p2align	4, 0x90
+	.type	halide_msan_check_memory_is_initialized,@function
+halide_msan_check_memory_is_initialized: # @halide_msan_check_memory_is_initialized
+# %bb.0:                                # %entry -- stub in this build: ignores its arguments and returns 0
+	pushq	%rbp
+	movq	%rsp, %rbp
+	xorl	%eax, %eax	# return value 0
+	popq	%rbp
+	retq
+.Lfunc_end207:
+	.size	halide_msan_check_memory_is_initialized, .Lfunc_end207-halide_msan_check_memory_is_initialized
+                                        # -- End function
+	.section	.text.halide_msan_check_buffer_is_initialized,"ax",@progbits
+	.weak	halide_msan_check_buffer_is_initialized # -- Begin function halide_msan_check_buffer_is_initialized
+	.p2align	4, 0x90
+	.type	halide_msan_check_buffer_is_initialized,@function
+halide_msan_check_buffer_is_initialized: # @halide_msan_check_buffer_is_initialized
+# %bb.0:                                # %entry -- stub in this build: ignores its arguments and returns 0
+	pushq	%rbp
+	movq	%rsp, %rbp
+	xorl	%eax, %eax	# return value 0
+	popq	%rbp
+	retq
+.Lfunc_end208:
+	.size	halide_msan_check_buffer_is_initialized, .Lfunc_end208-halide_msan_check_buffer_is_initialized
+                                        # -- End function
+	.section	.text.halide_msan_annotate_buffer_is_initialized,"ax",@progbits
+	.weak	halide_msan_annotate_buffer_is_initialized # -- Begin function halide_msan_annotate_buffer_is_initialized
+	.p2align	4, 0x90
+	.type	halide_msan_annotate_buffer_is_initialized,@function
+halide_msan_annotate_buffer_is_initialized: # @halide_msan_annotate_buffer_is_initialized
+# %bb.0:                                # %entry -- stub in this build: ignores its arguments and returns 0
+	pushq	%rbp
+	movq	%rsp, %rbp
+	xorl	%eax, %eax	# return value 0
+	popq	%rbp
+	retq
+.Lfunc_end209:
+	.size	halide_msan_annotate_buffer_is_initialized, .Lfunc_end209-halide_msan_annotate_buffer_is_initialized
+                                        # -- End function
+	.section	.text.halide_msan_annotate_buffer_is_initialized_as_destructor,"ax",@progbits
+	.weak	halide_msan_annotate_buffer_is_initialized_as_destructor # -- Begin function halide_msan_annotate_buffer_is_initialized_as_destructor
+	.p2align	4, 0x90
+	.type	halide_msan_annotate_buffer_is_initialized_as_destructor,@function
+halide_msan_annotate_buffer_is_initialized_as_destructor: # @halide_msan_annotate_buffer_is_initialized_as_destructor
+# %bb.0:                                # %entry -- stub in this build: sets up and tears down the frame and does nothing else; no return value is set
+	pushq	%rbp
+	movq	%rsp, %rbp
+	popq	%rbp
+	retq
+.Lfunc_end210:
+	.size	halide_msan_annotate_buffer_is_initialized_as_destructor, .Lfunc_end210-halide_msan_annotate_buffer_is_initialized_as_destructor
+                                        # -- End function
+	.section	.text.halide_default_can_use_target_features,"ax",@progbits
+	.weak	halide_default_can_use_target_features # -- Begin function halide_default_can_use_target_features
+	.p2align	4, 0x90
+	.type	halide_default_can_use_target_features,@function
+halide_default_can_use_target_features: # @halide_default_can_use_target_features
+# %bb.0:                                # %entry -- lazily caches the CPU feature words under a lock, then returns 1 unless a requested bit covered by cached words 0/1 is missing from cached words 2/3, else 0
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r12
+	pushq	%rbx
+	subq	$32, %rsp
+	movq	%rsi, %rbx	# rbx = second argument: pointer to the requested feature words
+	movl	%edi, %r14d	# r14d = first argument: word count, checked against 2 below
+	movq	_ZN6Halide7Runtime8Internal36halide_cpu_features_initialized_lockE@GOTPCREL(%rip), %rdi
+	callq	halide_mutex_lock@PLT
+	movq	_ZN6Halide7Runtime8Internal31halide_cpu_features_initializedE@GOTPCREL(%rip), %r12
+	cmpb	$0, (%r12)
+	je	.LBB211_1	# cache not filled yet: compute it while holding the lock
+# %bb.2:                                # %if.end
+	movq	_ZN6Halide7Runtime8Internal36halide_cpu_features_initialized_lockE@GOTPCREL(%rip), %rdi
+	callq	halide_mutex_unlock@PLT
+	cmpl	$2, %r14d
+	jne	.LBB211_3	# any count other than 2 is reported through halide_error
+.LBB211_4:                              # %if.end2
+	movq	_ZN6Halide7Runtime8Internal27halide_cpu_features_storageE@GOTPCREL(%rip), %rcx
+	movq	(%rcx), %rdx
+	andq	(%rbx), %rdx	# rdx = requested word 0 restricted to cached word 0 (presumably the known-features mask -- confirm)
+	jne	.LBB211_5
+	jmp	.LBB211_6
+.LBB211_1:                              # %if.then
+	leaq	-64(%rbp), %r15
+	movq	%r15, %rdi
+	callq	_ZN6Halide7Runtime8Internal23halide_get_cpu_featuresEv@PLT	# result is written through the pointer passed in rdi
+	movq	_ZN6Halide7Runtime8Internal27halide_cpu_features_storageE@GOTPCREL(%rip), %rdi
+	movl	$32, %edx
+	movq	%r15, %rsi
+	callq	memcpy@PLT	# copy the 32-byte result into the global cache
+	movb	$1, (%r12)	# mark the cache as filled
+	movq	_ZN6Halide7Runtime8Internal36halide_cpu_features_initialized_lockE@GOTPCREL(%rip), %rdi
+	callq	halide_mutex_unlock@PLT
+	cmpl	$2, %r14d
+	je	.LBB211_4
+.LBB211_3:                              # %if.then1
+	leaq	.L.str.199(%rip), %rsi
+	xorl	%edi, %edi
+	callq	halide_error@PLT	# called with a null first argument; the checks below still run if it returns
+	movq	_ZN6Halide7Runtime8Internal27halide_cpu_features_storageE@GOTPCREL(%rip), %rcx
+	movq	(%rcx), %rdx
+	andq	(%rbx), %rdx
+	je	.LBB211_6
+.LBB211_5:                              # %if.then7
+	movq	16(%rcx), %rsi	# cached word 2 (presumably the available-features mask for word 0 -- confirm)
+	xorl	%eax, %eax	# preload the failure result
+	andnq	%rdx, %rsi, %rdx	# rdx = requested-and-covered bits that are not set in cached word 2
+	jne	.LBB211_9	# some required bit is missing: return 0
+.LBB211_6:                              # %if.end14
+	movq	8(%rcx), %rdx
+	andq	8(%rbx), %rdx	# same test for requested word 1 against cached word 1
+	je	.LBB211_8
+# %bb.7:                                # %if.then7.1
+	movq	24(%rcx), %rcx	# cached word 3
+	xorl	%eax, %eax
+	andnq	%rdx, %rcx, %rcx	# rcx = requested-and-covered bits that are not set in cached word 3
+	jne	.LBB211_9
+.LBB211_8:                              # %if.end14.1
+	movl	$1, %eax	# every checked bit is present: return 1
+.LBB211_9:                              # %cleanup15
+	addq	$32, %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end211:
+	.size	halide_default_can_use_target_features, .Lfunc_end211-halide_default_can_use_target_features
+                                        # -- End function
+	.section	.text.halide_set_custom_can_use_target_features,"ax",@progbits
+	.weak	halide_set_custom_can_use_target_features # -- Begin function halide_set_custom_can_use_target_features
+	.p2align	4, 0x90
+	.type	halide_set_custom_can_use_target_features,@function
+halide_set_custom_can_use_target_features: # @halide_set_custom_can_use_target_features
+# %bb.0:                                # %entry -- installs rdi as the custom_can_use_target_features handler and returns the previous handler
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	_ZN6Halide7Runtime8Internal30custom_can_use_target_featuresE@GOTPCREL(%rip), %rcx
+	movq	(%rcx), %rax	# rax = previous handler, returned to the caller
+	movq	%rdi, (%rcx)	# plain load then store, not an atomic exchange
+	popq	%rbp
+	retq
+.Lfunc_end212:
+	.size	halide_set_custom_can_use_target_features, .Lfunc_end212-halide_set_custom_can_use_target_features
+                                        # -- End function
+	.section	.text.halide_can_use_target_features,"ax",@progbits
+	.weak	halide_can_use_target_features  # -- Begin function halide_can_use_target_features
+	.p2align	4, 0x90
+	.type	halide_can_use_target_features,@function
+halide_can_use_target_features:         # @halide_can_use_target_features
+# %bb.0:                                # %entry -- forwards the call, arguments untouched, to the handler stored in custom_can_use_target_features
+	pushq	%rbp
+	movq	%rsp, %rbp
+	movq	_ZN6Halide7Runtime8Internal30custom_can_use_target_featuresE@GOTPCREL(%rip), %rax
+	movq	(%rax), %rax	# rax = current handler pointer; it is not null-checked before the jump
+	popq	%rbp
+	jmpq	*%rax                           # TAILCALL
+.Lfunc_end213:
+	.size	halide_can_use_target_features, .Lfunc_end213-halide_can_use_target_features
+                                        # -- End function
+	.section	.rodata.cst16,"aM",@progbits,16
+	.p2align	4, 0x0                          # -- Begin function _ZN6Halide7Runtime8Internal23halide_get_cpu_featuresEv
+.LCPI214_0:
+	.quad	17042430231280                  # 0xf80000002f0
+	.quad	0                               # 0x0
+	.section	.text._ZN6Halide7Runtime8Internal23halide_get_cpu_featuresEv,"ax",@progbits
+	.weak	_ZN6Halide7Runtime8Internal23halide_get_cpu_featuresEv
+	.p2align	4, 0x90
+	.type	_ZN6Halide7Runtime8Internal23halide_get_cpu_featuresEv,@function
+_ZN6Halide7Runtime8Internal23halide_get_cpu_featuresEv: # @_ZN6Halide7Runtime8Internal23halide_get_cpu_featuresEv
+# %bb.0:                                # %entry -- fills the 32-byte result at rdi: word 0 = constant mask 0xf80000002f0, word 2 = bits derived from CPUID leaves 1 and 7; returns rdi
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%rbx
+	vmovaps	.LCPI214_0(%rip), %xmm0         # xmm0 = [17042430231280,0]
+	vmovups	%ymm0, (%rdi)	# 32-byte store initialising the result; the constant lands in word 0 (upper ymm half presumably zeroed by the VEX load above -- confirm)
+	movq	$1, -56(%rbp)	# CPUID inputs: eax = 1 (low dword), ecx = 0 (high dword)
+	#APP
+
+	xchgq	%rsi, %rbx	# inline asm parks rbx in rsi because cpuid overwrites ebx
+	movl	-56(%rbp), %eax
+	movl	-52(%rbp), %ecx
+	cpuid
+	movl	%eax, -56(%rbp)
+	movl	%ebx, -52(%rbp)
+	movl	%ecx, -48(%rbp)
+	movl	%edx, -44(%rbp)
+	xchgq	%rsi, %rbx	# restore rbx
+
+	#NO_APP
+	movl	-48(%rbp), %eax	# eax = ECX output of leaf 1
+	movq	%rax, %rcx
+	shrq	$15, %rcx
+	andl	$16, %ecx	# leaf-1 ECX bit 19 moved to result bit 4
+	movq	%rax, %rdx
+	shrq	$23, %rdx
+	andl	$32, %edx	# leaf-1 ECX bit 28 moved to result bit 5
+	orq	%rcx, %rdx
+	movq	%rax, %rcx
+	shrq	$20, %rcx
+	andl	$512, %ecx                      # imm = 0x200
+	orq	%rdx, %rcx
+	movq	%rax, %r8
+	shrq	$5, %r8
+	andl	$128, %r8d	# leaf-1 ECX bit 12 moved to result bit 7
+	orq	%rcx, %r8	# r8 = accumulated feature bits for result word 2
+	testl	$805834752, %eax                # imm = 0x30081000
+	je	.LBB214_2	# none of ECX bits 12/19/28/29 set: word 2 stays zero
+# %bb.1:
+	movq	%r8, 16(%rdi)
+.LBB214_2:
+	notl	%eax
+	testl	$1879048192, %eax               # imm = 0x70000000
+	jne	.LBB214_10	# continue only if leaf-1 ECX bits 28, 29 and 30 are all set
+# %bb.3:                                # %if.then30
+	movq	$7, -24(%rbp)	# CPUID inputs: eax = 7, ecx = 0
+	#APP
+
+	xchgq	%rsi, %rbx
+	movl	-24(%rbp), %eax
+	movl	-20(%rbp), %ecx
+	cpuid
+	movl	%eax, -24(%rbp)
+	movl	%ebx, -20(%rbp)
+	movl	%ecx, -16(%rbp)
+	movl	%edx, -12(%rbp)
+	xchgq	%rsi, %rbx
+
+	#NO_APP
+	movl	-20(%rbp), %eax	# eax = EBX output of leaf 7, subleaf 0
+	testb	$32, %al
+	je	.LBB214_5	# leaf-7 EBX bit 5 clear: skip result bit 6
+# %bb.4:                                # %if.then35
+	orq	$64, %r8	# set result bit 6
+	movq	%r8, 16(%rdi)
+.LBB214_5:                              # %if.end36
+	notl	%eax	# inverted EBX: a zero test result below means all tested bits are set
+	testl	$268500992, %eax                # imm = 0x10010000
+	jne	.LBB214_10	# continue only if leaf-7 EBX bits 16 and 28 are both set
+# %bb.6:                                # %if.then40
+	xorl	%ecx, %ecx
+	testl	$469827584, %eax                # imm = 0x1C010000
+	sete	%cl
+	shlq	$40, %rcx	# result bit 40 when EBX bits 16, 26, 27 and 28 are all set
+	orq	%r8, %rcx
+	xorl	%edx, %edx
+	testl	$-805109760, %eax               # imm = 0xD0030000
+	sete	%dl
+	shlq	$41, %rdx	# result bit 41 when EBX bits 16, 17, 28, 30 and 31 are all set
+	orq	%rcx, %rdx
+	movabsq	$549755813888, %r8              # imm = 0x8000000000
+	orq	%rdx, %r8	# result bit 39 is set unconditionally on this path
+	movq	%r8, 16(%rdi)
+	testl	$-803012608, %eax               # imm = 0xD0230000
+	jne	.LBB214_10	# continue only if EBX bits 16, 17, 21, 28, 30 and 31 are all set
+# %bb.7:                                # %if.then54
+	movabsq	$4398046511104, %rax            # imm = 0x40000000000
+	orq	%r8, %rax	# result bit 42; note r8 itself is not updated here
+	movq	%rax, 16(%rdi)
+	movabsq	$4294967303, %rax               # imm = 0x100000007
+	movq	%rax, -40(%rbp)	# CPUID inputs: eax = 7, ecx = 1
+	#APP
+
+	xchgq	%rsi, %rbx
+	movl	-40(%rbp), %eax
+	movl	-36(%rbp), %ecx
+	cpuid
+	movl	%eax, -40(%rbp)
+	movl	%ebx, -36(%rbp)
+	movl	%ecx, -32(%rbp)
+	movl	%edx, -28(%rbp)
+	xchgq	%rsi, %rbx
+
+	#NO_APP
+	testb	$8, -15(%rbp)	# bit 11 of the ECX output saved from leaf 7, subleaf 0 (byte 1 of the dword at -16)
+	je	.LBB214_10
+# %bb.8:                                # %land.lhs.true59
+	testb	$32, -40(%rbp)	# bit 5 of the EAX output of leaf 7, subleaf 1
+	je	.LBB214_10
+# %bb.9:                                # %if.then63
+	movabsq	$13194139533312, %rax           # imm = 0xC0000000000
+	orq	%rax, %r8	# result bits 42 and 43
+	movq	%r8, 16(%rdi)
+.LBB214_10:                             # %if.end67
+	movq	%rdi, %rax	# return the result pointer that was passed in rdi
+	popq	%rbx
+	popq	%rbp
+	vzeroupper
+	retq
+.Lfunc_end214:
+	.size	_ZN6Halide7Runtime8Internal23halide_get_cpu_featuresEv, .Lfunc_end214-_ZN6Halide7Runtime8Internal23halide_get_cpu_featuresEv
+                                        # -- End function
+	.section	.text.halide_use_jit_module,"ax",@progbits
+	.weak	halide_use_jit_module           # -- Begin function halide_use_jit_module
+	.p2align	4, 0x90
+	.type	halide_use_jit_module,@function
+halide_use_jit_module:                  # @halide_use_jit_module
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	popq	%rbp
+	retq
+.Lfunc_end215:
+	.size	halide_use_jit_module, .Lfunc_end215-halide_use_jit_module
+                                        # -- End function
+	.section	.text.halide_release_jit_module,"ax",@progbits
+	.weak	halide_release_jit_module       # -- Begin function halide_release_jit_module
+	.p2align	4, 0x90
+	.type	halide_release_jit_module,@function
+halide_release_jit_module:              # @halide_release_jit_module
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	popq	%rbp
+	retq
+.Lfunc_end216:
+	.size	halide_release_jit_module, .Lfunc_end216-halide_release_jit_module
+                                        # -- End function
+	.section	.rodata.cst16,"aM",@progbits,16
+	.p2align	4, 0x0                          # -- Begin function conv_y_par_for_conv_y_s0_x_xo_tile
+.LCPI217_0:
+	.long	0                               # 0x0
+	.long	1                               # 0x1
+	.long	2                               # 0x2
+	.long	3                               # 0x3
+	.section	.text.conv_y_par_for_conv_y_s0_x_xo_tile,"ax",@progbits
+	.globl	conv_y_par_for_conv_y_s0_x_xo_tile
+	.p2align	4, 0x90
+	.type	conv_y_par_for_conv_y_s0_x_xo_tile,@function
+conv_y_par_for_conv_y_s0_x_xo_tile:     # @conv_y_par_for_conv_y_s0_x_xo_tile
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	andq	$-32, %rsp
+	subq	$384, %rsp                      # imm = 0x180
+	movq	%rdx, %rcx
+	movl	32(%rdx), %r8d
+	movl	%esi, %edi
+	sarl	$31, %edi
+	xorl	%r11d, %r11d
+	testl	%r8d, %r8d
+	sete	%r11b
+	movl	%r8d, %r9d
+	sarl	$31, %r9d
+	subl	%edi, %esi
+	movl	%r9d, %ebx
+	notl	%ebx
+	leal	(%r8,%r11), %r10d
+	movl	%esi, %eax
+	cltd
+	idivl	%r10d
+	movl	%ebx, %r10d
+	subl	%r9d, %r10d
+	andl	%edi, %r10d
+	addl	%eax, %r10d
+	shll	$6, %r10d
+	testl	%r8d, %r8d
+	cmovel	%r8d, %r10d
+	negl	%r11d
+	orl	%r8d, %r11d
+	movl	%esi, %eax
+	cltd
+	idivl	%r11d
+	xorl	%r8d, %r9d
+	addl	%ebx, %r9d
+	andl	%edi, %r9d
+	addl	%edx, %r9d
+	shll	$6, %r9d
+	testl	%r8d, %r8d
+	movq	(%rcx), %rax
+	movq	8(%rcx), %rdx
+	movl	16(%rcx), %edi
+	movl	20(%rcx), %esi
+	movl	24(%rcx), %r11d
+	movl	%r11d, 28(%rsp)                 # 4-byte Spill
+	cmovel	%r8d, %r9d
+	addl	$-64, %edi
+	cmpl	%r9d, %edi
+	cmovgel	%r9d, %edi
+	addl	$-64, %esi
+	cmpl	%r10d, %esi
+	cmovgel	%r10d, %esi
+	movl	40(%rcx), %r8d
+	addl	%edi, %r8d
+	addl	36(%rcx), %edi
+	movl	28(%rcx), %ecx
+	movl	%ecx, 24(%rsp)                  # 4-byte Spill
+	leal	(%rsi,%rdi), %r9d
+	addl	$-20, %r9d
+	xorl	%r10d, %r10d
+	vmovdqa	.LCPI217_0(%rip), %xmm0         # xmm0 = [0,1,2,3]
+	.p2align	4, 0x90
+.LBB217_1:                              # %"1_for_conv_y.s0.y.yi"
+                                        # =>This Loop Header: Depth=1
+                                        #     Child Loop BB217_2 Depth 2
+                                        #       Child Loop BB217_3 Depth 3
+                                        #         Child Loop BB217_4 Depth 4
+                                        #       Child Loop BB217_7 Depth 3
+	leal	(%rsi,%r10), %edi
+	addl	28(%rsp), %edi                  # 4-byte Folded Reload
+	imull	24(%rsp), %edi                  # 4-byte Folded Reload
+	addl	%r8d, %edi
+	movslq	%edi, %r11
+	movq	%r9, %rbx
+	xorl	%r14d, %r14d
+	.p2align	4, 0x90
+.LBB217_2:                              # %"2_for_conv_y.s0.x.xi.xi"
+                                        #   Parent Loop BB217_1 Depth=1
+                                        # =>  This Loop Header: Depth=2
+                                        #       Child Loop BB217_3 Depth 3
+                                        #         Child Loop BB217_4 Depth 4
+                                        #       Child Loop BB217_7 Depth 3
+	leaq	(,%r14,4), %r15
+	movq	%rbx, %r12
+	xorl	%r13d, %r13d
+	.p2align	4, 0x90
+.LBB217_3:                              # %"3_for_conv_x.s0.y.rebased"
+                                        #   Parent Loop BB217_1 Depth=1
+                                        #     Parent Loop BB217_2 Depth=2
+                                        # =>    This Loop Header: Depth=3
+                                        #         Child Loop BB217_4 Depth 4
+	vxorps	%xmm1, %xmm1, %xmm1
+	xorl	%edi, %edi
+	.p2align	4, 0x90
+.LBB217_4:                              # %"4_for_conv_x$1.s1.k$x"
+                                        #   Parent Loop BB217_1 Depth=1
+                                        #     Parent Loop BB217_2 Depth=2
+                                        #       Parent Loop BB217_3 Depth=3
+                                        # =>      This Inner Loop Header: Depth=4
+	leal	(%r12,%rdi), %ecx
+	vmovd	%ecx, %xmm2
+	vpbroadcastd	%xmm2, %xmm2
+	vpaddd	%xmm0, %xmm2, %xmm2
+	vcvtdq2ps	%xmm2, %xmm2
+	vbroadcastss	(%rdx,%rdi,4), %xmm3
+	vfmadd231ps	%xmm3, %xmm2, %xmm1     # xmm1 = (xmm2 * xmm3) + xmm1
+	incq	%rdi
+	cmpq	$20, %rdi
+	jne	.LBB217_4
+# %bb.5:                                # %"3_consume_conv_x$1"
+                                        #   in Loop: Header=BB217_3 Depth=3
+	movq	%r13, %rcx
+	shlq	$4, %rcx
+	vmovaps	%xmm1, 32(%rsp,%rcx)
+	incq	%r13
+	incq	%r12
+	cmpq	$20, %r13
+	jne	.LBB217_3
+# %bb.6:                                # %"5_for_conv_y$1.s1.k$x.preheader"
+                                        #   in Loop: Header=BB217_2 Depth=2
+	vxorps	%xmm1, %xmm1, %xmm1
+	xorl	%edi, %edi
+	.p2align	4, 0x90
+.LBB217_7:                              # %"5_for_conv_y$1.s1.k$x"
+                                        #   Parent Loop BB217_1 Depth=1
+                                        #     Parent Loop BB217_2 Depth=2
+                                        # =>    This Inner Loop Header: Depth=3
+	vbroadcastss	(%rdx,%rdi), %xmm2
+	vfmadd231ps	32(%rsp,%rdi,4), %xmm2, %xmm1 # xmm1 = (xmm2 * mem) + xmm1
+	addq	$4, %rdi
+	cmpq	$80, %rdi
+	jne	.LBB217_7
+# %bb.8:                                # %"6_consume_conv_y$1"
+                                        #   in Loop: Header=BB217_2 Depth=2
+	addq	%r11, %r15
+	vmovups	%xmm1, (%rax,%r15,4)
+	incq	%r14
+	addq	$4, %rbx
+	cmpq	$16, %r14
+	jne	.LBB217_2
+# %bb.9:                                # %"2_end_for_conv_y.s0.x.xi.xi"
+                                        #   in Loop: Header=BB217_1 Depth=1
+	incq	%r10
+	incq	%r9
+	cmpq	$64, %r10
+	jne	.LBB217_1
+# %bb.10:                               # %destructor_block
+	xorl	%eax, %eax
+	leaq	-40(%rbp), %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end217:
+	.size	conv_y_par_for_conv_y_s0_x_xo_tile, .Lfunc_end217-conv_y_par_for_conv_y_s0_x_xo_tile
+                                        # -- End function
+	.section	.rodata.cst4,"aM",@progbits,4
+	.p2align	2, 0x0                          # -- Begin function conv_y
+.LCPI218_0:
+	.long	0x3fb8aa3b                      # float 1.44269502
+.LCPI218_1:
+	.long	0xbf317218                      # float -0.693147182
+.LCPI218_5:
+	.long	0x3f800000                      # float 1
+.LCPI218_6:
+	.long	0x7f800000                      # float +Inf
+	.section	.rodata.cst16,"aM",@progbits,16
+	.p2align	4, 0x0
+.LCPI218_2:
+	.long	0x3a9c2e66                      # float 0.00119156833
+	.long	0x39a797f3                      # float 3.19659332E-4
+	.zero	4
+	.zero	4
+.LCPI218_3:
+	.long	0x3d2a66bc                      # float 0.0416018814
+	.long	0x3c0b192a                      # float 0.00848988629
+	.zero	4
+	.zero	4
+.LCPI218_4:
+	.long	0x3effffde                      # float 0.499998987
+	.long	0x3e2aae1f                      # float 0.166679844
+	.zero	4
+	.zero	4
+	.section	.text.conv_y,"ax",@progbits
+	.globl	conv_y
+	.p2align	4, 0x90
+	.type	conv_y,@function
+conv_y:                                 # @conv_y
+# %bb.0:                                # %entry
+	pushq	%rbp
+	movq	%rsp, %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	andq	$-32, %rsp
+	subq	$288, %rsp                      # imm = 0x120
+	testq	%rdi, %rdi
+	je	.LBB218_1
+# %bb.2:                                # %"assert succeeded"
+	movq	40(%rdi), %r9
+	movslq	4(%r9), %rbx
+	xorl	%r8d, %r8d
+	cmpq	$-63, %rbx
+	setl	%r8b
+	movl	20(%r9), %r10d
+	leal	63(%rbx), %esi
+	movl	%esi, %r11d
+	sarl	$6, %r11d
+	leal	63(%r10), %r13d
+	sarl	$6, %r13d
+	imull	%r11d, %r13d
+	xorl	%ecx, %ecx
+	testl	%r13d, %r13d
+	setle	%cl
+	leal	(%rcx,%r13), %eax
+	decl	%eax
+                                        # kill: def $ecx killed $ecx killed $rcx
+	negl	%ecx
+	cmpl	$64, %esi
+	movl	%r11d, 44(%rsp)                 # 4-byte Spill
+	adcl	$0, %r11d
+	cltd
+	idivl	%r11d
+	leal	(%r8,%r8), %r11d
+	decl	%r11d
+	andl	%ecx, %r11d
+	addl	%eax, %r11d
+	shll	$6, %r11d
+	xorl	%eax, %eax
+	cmpl	$64, %esi
+	movq	16(%rdi), %rsi
+	movq	24(%rdi), %rcx
+	movq	%rcx, 64(%rsp)                  # 8-byte Spill
+	movl	(%r9), %ecx
+	movq	%rcx, 24(%rsp)                  # 8-byte Spill
+	movl	8(%r9), %ecx
+	movl	%ecx, 20(%rsp)                  # 4-byte Spill
+	cmovbl	%eax, %r11d
+	movl	$-64, %ecx
+	subl	%ebx, %ecx
+	leal	-1(%rbx), %edx
+	cmpl	%edx, %ecx
+	cmovlel	%edx, %ecx
+	movl	%ecx, %edx
+	shrl	$6, %edx
+	sarl	$31, %ecx
+	andnl	%edx, %ecx, %r15d
+	shll	$6, %r15d
+	leal	-64(%rbx), %ecx
+	cmpl	%r15d, %ecx
+	cmovll	%ecx, %r15d
+	testq	%rbx, %rbx
+	movl	$0, %r8d
+	cmovgl	%r11d, %r8d
+	cmovgl	%eax, %r11d
+	leal	-64(%r10), %eax
+	cmpl	%r8d, %eax
+	cmovll	%eax, %r8d
+	cmpl	%r11d, %eax
+	cmovll	%eax, %r11d
+	movl	16(%r9), %edx
+	movl	24(%r9), %eax
+	movl	%eax, 40(%rsp)                  # 4-byte Spill
+	movl	32(%rdi), %r14d
+	movl	36(%rdi), %eax
+	movl	%eax, 16(%rsp)                  # 4-byte Spill
+	movq	%rsi, 72(%rsp)                  # 8-byte Spill
+	testq	%rsi, %rsi
+	jne	.LBB218_9
+# %bb.3:                                # %_halide_buffer_is_bounds_query.exit
+	cmpq	$0, (%rdi)
+	jne	.LBB218_9
+# %bb.4:                                # %then_bb
+	movl	%r15d, %ecx
+	sarl	$31, %ecx
+	andnl	%r15d, %ecx, %ecx
+	cmpl	$64, %ebx
+	movl	$64, %esi
+	cmovll	%ebx, %esi
+	movq	24(%rsp), %rax                  # 8-byte Reload
+	addl	%eax, %esi
+	addl	$-64, %esi
+	addl	$64, %ecx
+	movq	%rdx, %rax
+	leal	(%r11,%rdx), %r9d
+	movl	%r8d, %r12d
+	subl	%r11d, %r12d
+	addl	$64, %r12d
+	movl	%esi, 80(%rsp)
+	movl	%ecx, 84(%rsp)
+	movq	$1, 88(%rsp)
+	movl	%r9d, 96(%rsp)
+	movl	%r12d, 100(%rsp)
+	movl	%ecx, 104(%rsp)
+	movl	$0, 108(%rsp)
+	vxorps	%xmm0, %xmm0, %xmm0
+	vmovups	%xmm0, (%rdi)
+	movq	$0, 16(%rdi)
+	movabsq	$8590008322, %rcx               # imm = 0x200012002
+	movq	%rcx, 32(%rdi)
+	xorl	%ecx, %ecx
+	.p2align	4, 0x90
+.LBB218_5:                              # %for.body.i
+                                        # =>This Inner Loop Header: Depth=1
+	movq	40(%rdi), %rsi
+	vmovups	80(%rsp,%rcx), %xmm0
+	vmovups	%xmm0, (%rsi,%rcx)
+	addq	$16, %rcx
+	cmpq	$32, %rcx
+	jne	.LBB218_5
+# %bb.6:                                # %after_bb
+	movq	$0, 24(%rdi)
+	cmpq	$0, 16(%rdi)
+	movq	%rax, %rdx
+	jne	.LBB218_9
+# %bb.7:                                # %_halide_buffer_is_bounds_query.exit37
+	cmpq	$0, (%rdi)
+	je	.LBB218_8
+.LBB218_9:                              # %then_bb2
+	movl	%r14d, 36(%rsp)                 # 4-byte Spill
+	movq	%r13, 56(%rsp)                  # 8-byte Spill
+	movabsq	$-9223372036854775808, %r9      # imm = 0x8000000000000000
+	movq	%rdx, %rax
+	xorl	%r12d, %r12d
+	cmpl	$73730, %r14d                   # imm = 0x12002
+	setne	%r12b
+	movl	%r15d, %esi
+	sarl	$31, %esi
+	andnl	%r15d, %esi, %r15d
+	cmpl	$64, %ebx
+	movl	$64, %edi
+	movl	$64, %r14d
+	cmovll	%ebx, %r14d
+	addl	%r14d, %r15d
+	cmpl	$65, %r15d
+	cmovgel	%r15d, %edi
+	xorl	%esi, %esi
+	cmpl	%ebx, %edi
+	setg	%sil
+	testl	%r11d, %r11d
+	sets	%cl
+	leal	(%r8,%rdx), %edi
+	addl	$64, %edi
+	movq	%rdx, 48(%rsp)                  # 8-byte Spill
+	addl	%r10d, %edx
+	cmpl	%edx, %edi
+	setg	%dil
+	orb	%cl, %dil
+	movq	%r10, %rcx
+	shrq	$27, %rcx
+	andl	$-16, %ecx
+	xorl	%eax, %eax
+	cmpl	$1, 20(%rsp)                    # 4-byte Folded Reload
+	setne	%al
+	shlq	$5, %rax
+	movq	%r10, %r13
+	leaq	2(%r9), %r10
+	cmpl	$2, 16(%rsp)                    # 4-byte Folded Reload
+	cmoveq	%r9, %r10
+	orq	%r12, %r10
+	orq	%rax, %r10
+	orq	%rcx, %r10
+	leaq	(%r10,%rsi,4), %rax
+	movzbl	%dil, %ecx
+	leaq	(%rax,%rcx,8), %rax
+	xorl	%esi, %esi
+	tzcntq	%rax, %rsi
+	cmpl	$5, %esi
+	jbe	.LBB218_10
+# %bb.11:                               # %no_errors_bb
+	movl	%ebx, %r9d
+	movslq	%r13d, %rdx
+	movl	40(%rsp), %r11d                 # 4-byte Reload
+	movslq	%r11d, %rax
+	shrq	$31, %r9
+	imulq	%rdx, %rax
+	imulq	%rbx, %rdx
+	movq	%rax, %r8
+	negq	%r8
+	cmovsq	%rax, %r8
+	xorl	%esi, %esi
+	cmpq	$2147483647, %r8                # imm = 0x7FFFFFFF
+	seta	%sil
+	addq	%rsi, %rsi
+	xorl	%eax, %eax
+	cmpq	$2147483647, %rdx               # imm = 0x7FFFFFFF
+	setg	%al
+	shlq	$2, %rax
+	movq	64(%rsp), %rdi                  # 8-byte Reload
+	shll	$2, %edi
+	andl	$8, %edi
+	movabsq	$-9223372036854775808, %r14     # imm = 0x8000000000000000
+	leaq	16(%r14), %rcx
+	movq	72(%rsp), %r10                  # 8-byte Reload
+	testq	%r10, %r10
+	cmovneq	%r14, %rcx
+	orq	%rdi, %rcx
+	orq	%r9, %rcx
+	orq	%rax, %rcx
+	orq	%rsi, %rcx
+	tzcntq	%rcx, %rcx
+	cmpl	$4, %ecx
+	jbe	.LBB218_12
+# %bb.26:                               # %"6_for_kernel.s0.x.preheader"
+	leaq	160(%rsp), %rdx
+	xorl	%edi, %edi
+	vmovss	.LCPI218_0(%rip), %xmm0         # xmm0 = mem[0],zero,zero,zero
+	vmovss	.LCPI218_1(%rip), %xmm1         # xmm1 = mem[0],zero,zero,zero
+	vmovaps	.LCPI218_2(%rip), %xmm2         # xmm2 = <1.19156833E-3,3.19659332E-4,u,u>
+	vmovaps	.LCPI218_3(%rip), %xmm3         # xmm3 = <4.16018814E-2,8.48988629E-3,u,u>
+	vmovaps	.LCPI218_4(%rip), %xmm4         # xmm4 = <4.99998987E-1,1.66679844E-1,u,u>
+	vbroadcastss	.LCPI218_5(%rip), %xmm5 # xmm5 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+	vmovss	.LCPI218_6(%rip), %xmm6         # xmm6 = mem[0],zero,zero,zero
+	movq	56(%rsp), %rcx                  # 8-byte Reload
+	movq	48(%rsp), %r9                   # 8-byte Reload
+	jmp	.LBB218_27
+	.p2align	4, 0x90
+.LBB218_32:                             # %"6_for_kernel.s0.x"
+                                        #   in Loop: Header=BB218_27 Depth=1
+	vmovss	%xmm7, (%rdx)
+	decq	%rdi
+	addq	$4, %rdx
+	cmpq	$-20, %rdi
+	je	.LBB218_33
+.LBB218_27:                             # %"6_for_kernel.s0.x"
+                                        # =>This Inner Loop Header: Depth=1
+	vcvtsi2ss	%edi, %xmm10, %xmm7
+	vmulss	%xmm0, %xmm7, %xmm8
+	vroundss	$9, %xmm8, %xmm8, %xmm8
+	vcvttss2si	%xmm8, %r8d
+	vfmadd231ss	%xmm1, %xmm8, %xmm7     # xmm7 = (xmm8 * xmm1) + xmm7
+	vmulss	%xmm7, %xmm7, %xmm8
+	vbroadcastss	%xmm8, %xmm8
+	vmovaps	%xmm2, %xmm9
+	vfmadd213ps	%xmm3, %xmm8, %xmm9     # xmm9 = (xmm8 * xmm9) + xmm3
+	vfmadd213ps	%xmm4, %xmm8, %xmm9     # xmm9 = (xmm8 * xmm9) + xmm4
+	cmpl	$128, %r8d
+	jl	.LBB218_28
+# %bb.29:                               # %"6_for_kernel.s0.x"
+                                        #   in Loop: Header=BB218_27 Depth=1
+	vmovaps	%xmm6, %xmm7
+	cmpl	$-126, %r8d
+	jge	.LBB218_32
+	jmp	.LBB218_31
+	.p2align	4, 0x90
+.LBB218_28:                             #   in Loop: Header=BB218_27 Depth=1
+	movl	%r8d, %eax
+	shll	$23, %eax
+	addl	$1065353216, %eax               # imm = 0x3F800000
+	vfmadd213ps	%xmm5, %xmm9, %xmm8     # xmm8 = (xmm9 * xmm8) + xmm5
+	vmovshdup	%xmm8, %xmm9            # xmm9 = xmm8[1,1,3,3]
+	vfmadd213ss	%xmm8, %xmm9, %xmm7     # xmm7 = (xmm9 * xmm7) + xmm8
+	vmovd	%eax, %xmm8
+	vmulss	%xmm7, %xmm8, %xmm7
+	cmpl	$-126, %r8d
+	jge	.LBB218_32
+.LBB218_31:                             # %"6_for_kernel.s0.x"
+                                        #   in Loop: Header=BB218_27 Depth=1
+	vxorps	%xmm7, %xmm7, %xmm7
+	jmp	.LBB218_32
+.LBB218_33:                             # %"6_end_for_kernel.s0.x"
+	movl	%r9d, %eax
+	imull	%r11d, %eax
+	negl	%eax
+	movq	24(%rsp), %rdi                  # 8-byte Reload
+	addl	%r9d, %edi
+	movq	%r10, 112(%rsp)
+	leaq	160(%rsp), %rdx
+	movq	%rdx, 120(%rsp)
+	movl	%ebx, 128(%rsp)
+	movl	%r13d, 132(%rsp)
+	movl	%r9d, 136(%rsp)
+	movl	%r11d, 140(%rsp)
+	movl	44(%rsp), %edx                  # 4-byte Reload
+	movl	%edx, 144(%rsp)
+	movl	%edi, 148(%rsp)
+	movl	%eax, 152(%rsp)
+	movq	conv_y_par_for_conv_y_s0_x_xo_tile@GOTPCREL(%rip), %rsi
+	leaq	112(%rsp), %r8
+	xorl	%edi, %edi
+	xorl	%edx, %edx
+                                        # kill: def $ecx killed $ecx killed $rcx
+	callq	halide_do_par_for@PLT
+.LBB218_34:                             # %common.ret
+	leaq	-40(%rbp), %rsp
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.LBB218_8:
+	xorl	%eax, %eax
+	jmp	.LBB218_34
+.LBB218_1:                              # %"assert failed"
+	leaq	.Lstr(%rip), %rsi
+	xorl	%edi, %edi
+	callq	halide_error_buffer_argument_is_null@PLT
+	jmp	.LBB218_34
+.LBB218_10:                             # %then_bb2
+	movl	%edx, %edi
+	movl	36(%rsp), %edx                  # 4-byte Reload
+	leaq	.LJTI218_0(%rip), %rax
+	movslq	(%rax,%rsi,4), %rsi
+	addq	%rax, %rsi
+	jmpq	*%rsi
+.LBB218_13:                             # %assert_failed
+	leaq	.Lstr.204(%rip), %rsi
+	xorl	%edi, %edi
+	movl	$73730, %ecx                    # imm = 0x12002
+	callq	halide_error_bad_type@PLT
+	jmp	.LBB218_34
+.LBB218_12:                             # %no_errors_bb
+	leaq	.LJTI218_1(%rip), %rax
+	movslq	(%rax,%rcx,4), %rcx
+	addq	%rax, %rcx
+	jmpq	*%rcx
+.LBB218_20:                             # %assert_failed10
+	leaq	.Lstr(%rip), %rsi
+	movl	$2147483647, %ecx               # imm = 0x7FFFFFFF
+	xorl	%edi, %edi
+	movq	%rbx, %rdx
+	callq	halide_error_buffer_allocation_too_large@PLT
+	jmp	.LBB218_34
+.LBB218_22:                             # %assert_failed11
+	leaq	.Lstr(%rip), %rsi
+	movl	$2147483647, %ecx               # imm = 0x7FFFFFFF
+	xorl	%edi, %edi
+	movq	%r8, %rdx
+	callq	halide_error_buffer_allocation_too_large@PLT
+	jmp	.LBB218_34
+.LBB218_23:                             # %assert_failed12
+	leaq	.Lstr(%rip), %rsi
+	movl	$2147483647, %ecx               # imm = 0x7FFFFFFF
+	xorl	%edi, %edi
+	callq	halide_error_buffer_extents_too_large@PLT
+	jmp	.LBB218_34
+.LBB218_24:                             # %assert_failed13
+	leaq	.Lstr.204(%rip), %rsi
+	xorl	%edi, %edi
+	callq	halide_error_device_dirty_with_no_device_support@PLT
+	jmp	.LBB218_34
+.LBB218_25:                             # %assert_failed14
+	leaq	.Lstr.204(%rip), %rsi
+	xorl	%edi, %edi
+	callq	halide_error_host_is_null@PLT
+	jmp	.LBB218_34
+.LBB218_14:                             # %assert_failed4
+	leaq	.Lstr.204(%rip), %rsi
+	xorl	%edi, %edi
+	movl	16(%rsp), %edx                  # 4-byte Reload
+	movl	$2, %ecx
+	callq	halide_error_bad_dimensions@PLT
+	jmp	.LBB218_34
+.LBB218_15:                             # %assert_failed5
+	movq	24(%rsp), %r9                   # 8-byte Reload
+	leal	(%r9,%r14), %ecx
+	addl	$-64, %ecx
+	leal	-1(%r9,%r15), %r8d
+	leal	(%r9,%rbx), %eax
+	decl	%eax
+	movl	%eax, (%rsp)
+	leaq	.Lstr.204(%rip), %rsi
+	xorl	%edi, %edi
+	xorl	%edx, %edx
+                                        # kill: def $r9d killed $r9d killed $r9
+	callq	halide_error_access_out_of_bounds@PLT
+	jmp	.LBB218_34
+.LBB218_17:                             # %assert_failed6
+	movq	48(%rsp), %r9                   # 8-byte Reload
+	addl	%r9d, %r8d
+	addl	%r9d, %r11d
+	addl	$63, %r8d
+	decl	%edi
+	movl	%edi, (%rsp)
+	leaq	.Lstr.204(%rip), %rsi
+	xorl	%edi, %edi
+	movl	$1, %edx
+	movl	%r11d, %ecx
+                                        # kill: def $r8d killed $r8d killed $r8
+                                        # kill: def $r9d killed $r9d killed $r9
+	callq	halide_error_access_out_of_bounds@PLT
+	jmp	.LBB218_34
+.LBB218_18:                             # %assert_failed7
+	leaq	.Lstr.204(%rip), %rsi
+	xorl	%edi, %edi
+	movl	$1, %edx
+	movl	%r13d, %ecx
+	callq	halide_error_buffer_extents_negative@PLT
+	jmp	.LBB218_34
+.LBB218_19:                             # %assert_failed8
+	leaq	.Lstr.205(%rip), %rsi
+	leaq	.Lstr.206(%rip), %rcx
+	xorl	%edi, %edi
+	movl	20(%rsp), %edx                  # 4-byte Reload
+	movl	$1, %r8d
+	callq	halide_error_constraint_violated@PLT
+	jmp	.LBB218_34
+.Lfunc_end218:
+	.size	conv_y, .Lfunc_end218-conv_y
+	.section	.rodata.conv_y,"a",@progbits
+	.p2align	2, 0x0
+.LJTI218_0:
+	.long	.LBB218_13-.LJTI218_0
+	.long	.LBB218_14-.LJTI218_0
+	.long	.LBB218_15-.LJTI218_0
+	.long	.LBB218_17-.LJTI218_0
+	.long	.LBB218_18-.LJTI218_0
+	.long	.LBB218_19-.LJTI218_0
+.LJTI218_1:
+	.long	.LBB218_20-.LJTI218_1
+	.long	.LBB218_22-.LJTI218_1
+	.long	.LBB218_23-.LJTI218_1
+	.long	.LBB218_24-.LJTI218_1
+	.long	.LBB218_25-.LJTI218_1
+                                        # -- End function
+	.section	.text.conv_y_argv,"ax",@progbits
+	.globl	conv_y_argv                     # -- Begin function conv_y_argv
+	.p2align	4, 0x90
+	.type	conv_y_argv,@function
+conv_y_argv:                            # @conv_y_argv
+# %bb.0:                                # %entry
+	movq	(%rdi), %rdi
+	jmp	conv_y@PLT                      # TAILCALL
+.Lfunc_end219:
+	.size	conv_y_argv, .Lfunc_end219-conv_y_argv
+                                        # -- End function
+	.section	.text.conv_y_metadata,"ax",@progbits
+	.globl	conv_y_metadata                 # -- Begin function conv_y_metadata
+	.p2align	4, 0x90
+	.type	conv_y_metadata,@function
+conv_y_metadata:                        # @conv_y_metadata
+# %bb.0:                                # %entry
+	leaq	.Lconv_y_metadata_storage(%rip), %rax
+	retq
+.Lfunc_end220:
+	.size	conv_y_metadata, .Lfunc_end220-conv_y_metadata
+                                        # -- End function
+	.type	_ZN6Halide7Runtime8Internal13custom_mallocE,@object # @_ZN6Halide7Runtime8Internal13custom_mallocE
+	.data
+	.weak	_ZN6Halide7Runtime8Internal13custom_mallocE
+	.p2align	3, 0x0
+_ZN6Halide7Runtime8Internal13custom_mallocE:
+	.quad	halide_default_malloc
+	.size	_ZN6Halide7Runtime8Internal13custom_mallocE, 8
+
+	.type	_ZN6Halide7Runtime8Internal11custom_freeE,@object # @_ZN6Halide7Runtime8Internal11custom_freeE
+	.weak	_ZN6Halide7Runtime8Internal11custom_freeE
+	.p2align	3, 0x0
+_ZN6Halide7Runtime8Internal11custom_freeE:
+	.quad	halide_default_free
+	.size	_ZN6Halide7Runtime8Internal11custom_freeE, 8
+
+	.type	.L.str,@object                  # @.str
+	.section	.rodata.str1.1,"aMS",@progbits,1
+.L.str:
+	.asciz	"Error: "
+	.size	.L.str, 8
+
+	.type	_ZN6Halide7Runtime8Internal13error_handlerE,@object # @_ZN6Halide7Runtime8Internal13error_handlerE
+	.data
+	.weak	_ZN6Halide7Runtime8Internal13error_handlerE
+	.p2align	3, 0x0
+_ZN6Halide7Runtime8Internal13error_handlerE:
+	.quad	halide_default_error
+	.size	_ZN6Halide7Runtime8Internal13error_handlerE, 8
+
+	.type	_ZN6Halide7Runtime8Internal12custom_printE,@object # @_ZN6Halide7Runtime8Internal12custom_printE
+	.weak	_ZN6Halide7Runtime8Internal12custom_printE
+	.p2align	3, 0x0
+_ZN6Halide7Runtime8Internal12custom_printE:
+	.quad	halide_default_print
+	.size	_ZN6Halide7Runtime8Internal12custom_printE, 8
+
+	.type	halide_reference_clock_inited,@object # @halide_reference_clock_inited
+	.bss
+	.weak	halide_reference_clock_inited
+halide_reference_clock_inited:
+	.byte	0                               # 0x0
+	.size	halide_reference_clock_inited, 1
+
+	.type	halide_reference_clock,@object  # @halide_reference_clock
+	.weak	halide_reference_clock
+	.p2align	3, 0x0
+halide_reference_clock:
+	.zero	16
+	.size	halide_reference_clock, 16
+
+	.section	.fini_array,"aw",@fini_array
+	.p2align	3, 0x90
+	.quad	halide_thread_pool_cleanup
+	.quad	halide_trace_cleanup
+	.quad	halide_cache_cleanup
+	.quad	halide_profiler_shutdown
+	.type	.L.str.5,@object                # @.str.5
+	.section	.rodata.str1.1,"aMS",@progbits,1
+.L.str.5:
+	.asciz	"/home/halidenightly/build_bot/worker/halide-nightly-main-llvm16-x86-64-linux-cmake/halide-source/src/runtime/synchronization_common.h:251 halide_abort_if_false() failed: next != nullptr\n"
+	.size	.L.str.5, 187
+
+	.type	_ZN6Halide7Runtime8Internal15Synchronization5tableE,@object # @_ZN6Halide7Runtime8Internal15Synchronization5tableE
+	.bss
+	.weak	_ZN6Halide7Runtime8Internal15Synchronization5tableE
+	.p2align	3, 0x0
+_ZN6Halide7Runtime8Internal15Synchronization5tableE:
+	.zero	24576
+	.size	_ZN6Halide7Runtime8Internal15Synchronization5tableE, 24576
+
+	.type	.L.str.1,@object                # @.str.1
+	.section	.rodata.str1.1,"aMS",@progbits,1
+.L.str.1:
+	.asciz	"HL_NUM_THREADS"
+	.size	.L.str.1, 15
+
+	.type	.L.str.2,@object                # @.str.2
+.L.str.2:
+	.asciz	"HL_NUMTHREADS"
+	.size	.L.str.2, 14
+
+	.type	_ZN6Halide7Runtime8Internal10work_queueE,@object # @_ZN6Halide7Runtime8Internal10work_queueE
+	.bss
+	.weak	_ZN6Halide7Runtime8Internal10work_queueE
+	.p2align	3, 0x0
+_ZN6Halide7Runtime8Internal10work_queueE:
+	.zero	8
+	.long	0                               # 0x0
+	.long	0                               # 0x0
+	.quad	0
+	.long	0                               # 0x0
+	.long	0                               # 0x0
+	.long	0                               # 0x0
+	.zero	4
+	.zero	8
+	.zero	8
+	.zero	8
+	.long	0                               # 0x0
+	.long	0                               # 0x0
+	.zero	2048
+	.byte	0                               # 0x0
+	.byte	0                               # 0x0
+	.zero	2
+	.long	0                               # 0x0
+	.size	_ZN6Halide7Runtime8Internal10work_queueE, 2128
+
+	.type	.L.str.3,@object                # @.str.3
+	.section	.rodata.str1.1,"aMS",@progbits,1
+.L.str.3:
+	.asciz	"/home/halidenightly/build_bot/worker/halide-nightly-main-llvm16-x86-64-linux-cmake/halide-source/src/runtime/thread_pool_common.h:527 halide_abort_if_false() failed: (min_threads <= ((task_parent->task.min_threads * task_parent->active_workers) - task_parent->threads_reserved)) && \"Logic error: thread over commit.\\n\"\n"
+	.size	.L.str.3, 320
+
+	.type	_ZN6Halide7Runtime8Internal14custom_do_taskE,@object # @_ZN6Halide7Runtime8Internal14custom_do_taskE
+	.data
+	.weak	_ZN6Halide7Runtime8Internal14custom_do_taskE
+	.p2align	3, 0x0
+_ZN6Halide7Runtime8Internal14custom_do_taskE:
+	.quad	halide_default_do_task
+	.size	_ZN6Halide7Runtime8Internal14custom_do_taskE, 8
+
+	.type	_ZN6Halide7Runtime8Internal19custom_do_loop_taskE,@object # @_ZN6Halide7Runtime8Internal19custom_do_loop_taskE
+	.weak	_ZN6Halide7Runtime8Internal19custom_do_loop_taskE
+	.p2align	3, 0x0
+_ZN6Halide7Runtime8Internal19custom_do_loop_taskE:
+	.quad	halide_default_do_loop_task
+	.size	_ZN6Halide7Runtime8Internal19custom_do_loop_taskE, 8
+
+	.type	_ZN6Halide7Runtime8Internal17custom_do_par_forE,@object # @_ZN6Halide7Runtime8Internal17custom_do_par_forE
+	.weak	_ZN6Halide7Runtime8Internal17custom_do_par_forE
+	.p2align	3, 0x0
+_ZN6Halide7Runtime8Internal17custom_do_par_forE:
+	.quad	halide_default_do_par_for
+	.size	_ZN6Halide7Runtime8Internal17custom_do_par_forE, 8
+
+	.type	_ZN6Halide7Runtime8Internal24custom_do_parallel_tasksE,@object # @_ZN6Halide7Runtime8Internal24custom_do_parallel_tasksE
+	.weak	_ZN6Halide7Runtime8Internal24custom_do_parallel_tasksE
+	.p2align	3, 0x0
+_ZN6Halide7Runtime8Internal24custom_do_parallel_tasksE:
+	.quad	halide_default_do_parallel_tasks
+	.size	_ZN6Halide7Runtime8Internal24custom_do_parallel_tasksE, 8
+
+	.type	_ZN6Halide7Runtime8Internal21custom_semaphore_initE,@object # @_ZN6Halide7Runtime8Internal21custom_semaphore_initE
+	.weak	_ZN6Halide7Runtime8Internal21custom_semaphore_initE
+	.p2align	3, 0x0
+_ZN6Halide7Runtime8Internal21custom_semaphore_initE:
+	.quad	halide_default_semaphore_init
+	.size	_ZN6Halide7Runtime8Internal21custom_semaphore_initE, 8
+
+	.type	_ZN6Halide7Runtime8Internal28custom_semaphore_try_acquireE,@object # @_ZN6Halide7Runtime8Internal28custom_semaphore_try_acquireE
+	.weak	_ZN6Halide7Runtime8Internal28custom_semaphore_try_acquireE
+	.p2align	3, 0x0
+_ZN6Halide7Runtime8Internal28custom_semaphore_try_acquireE:
+	.quad	halide_default_semaphore_try_acquire
+	.size	_ZN6Halide7Runtime8Internal28custom_semaphore_try_acquireE, 8
+
+	.type	_ZN6Halide7Runtime8Internal24custom_semaphore_releaseE,@object # @_ZN6Halide7Runtime8Internal24custom_semaphore_releaseE
+	.weak	_ZN6Halide7Runtime8Internal24custom_semaphore_releaseE
+	.p2align	3, 0x0
+_ZN6Halide7Runtime8Internal24custom_semaphore_releaseE:
+	.quad	halide_default_semaphore_release
+	.size	_ZN6Halide7Runtime8Internal24custom_semaphore_releaseE, 8
+
+	.type	.L.str.4,@object                # @.str.4
+	.section	.rodata.str1.1,"aMS",@progbits,1
+.L.str.4:
+	.asciz	"halide_set_num_threads: must be >= 0."
+	.size	.L.str.4, 38
+
+	.type	_ZTVN6Halide7Runtime8Internal15Synchronization21mutex_parking_controlE,@object # @_ZTVN6Halide7Runtime8Internal15Synchronization21mutex_parking_controlE
+	.section	.data.rel.ro._ZTVN6Halide7Runtime8Internal15Synchronization21mutex_parking_controlE,"aGw",@progbits,_ZTVN6Halide7Runtime8Internal15Synchronization21mutex_parking_controlE,comdat
+	.weak	_ZTVN6Halide7Runtime8Internal15Synchronization21mutex_parking_controlE
+	.p2align	3, 0x0
+_ZTVN6Halide7Runtime8Internal15Synchronization21mutex_parking_controlE:
+	.quad	0
+	.quad	0
+	.quad	_ZN6Halide7Runtime8Internal15Synchronization21mutex_parking_control8validateERNS2_15validate_actionE
+	.quad	_ZN6Halide7Runtime8Internal15Synchronization15parking_control12before_sleepEv
+	.quad	_ZN6Halide7Runtime8Internal15Synchronization21mutex_parking_control6unparkEib
+	.quad	_ZN6Halide7Runtime8Internal15Synchronization15parking_control16requeue_callbackERKNS2_15validate_actionEbb
+	.size	_ZTVN6Halide7Runtime8Internal15Synchronization21mutex_parking_controlE, 48
+
+	.type	_ZTVN6Halide7Runtime8Internal15Synchronization25broadcast_parking_controlE,@object # @_ZTVN6Halide7Runtime8Internal15Synchronization25broadcast_parking_controlE
+	.section	.data.rel.ro._ZTVN6Halide7Runtime8Internal15Synchronization25broadcast_parking_controlE,"aGw",@progbits,_ZTVN6Halide7Runtime8Internal15Synchronization25broadcast_parking_controlE,comdat
+	.weak	_ZTVN6Halide7Runtime8Internal15Synchronization25broadcast_parking_controlE
+	.p2align	3, 0x0
+_ZTVN6Halide7Runtime8Internal15Synchronization25broadcast_parking_controlE:
+	.quad	0
+	.quad	0
+	.quad	_ZN6Halide7Runtime8Internal15Synchronization25broadcast_parking_control8validateERNS2_15validate_actionE
+	.quad	_ZN6Halide7Runtime8Internal15Synchronization15parking_control12before_sleepEv
+	.quad	_ZN6Halide7Runtime8Internal15Synchronization15parking_control6unparkEib
+	.quad	_ZN6Halide7Runtime8Internal15Synchronization25broadcast_parking_control16requeue_callbackERKNS2_15validate_actionEbb
+	.size	_ZTVN6Halide7Runtime8Internal15Synchronization25broadcast_parking_controlE, 48
+
+	.type	_ZTVN6Halide7Runtime8Internal15Synchronization22signal_parking_controlE,@object # @_ZTVN6Halide7Runtime8Internal15Synchronization22signal_parking_controlE
+	.section	.data.rel.ro._ZTVN6Halide7Runtime8Internal15Synchronization22signal_parking_controlE,"aGw",@progbits,_ZTVN6Halide7Runtime8Internal15Synchronization22signal_parking_controlE,comdat
+	.weak	_ZTVN6Halide7Runtime8Internal15Synchronization22signal_parking_controlE
+	.p2align	3, 0x0
+_ZTVN6Halide7Runtime8Internal15Synchronization22signal_parking_controlE:
+	.quad	0
+	.quad	0
+	.quad	_ZN6Halide7Runtime8Internal15Synchronization15parking_control8validateERNS2_15validate_actionE
+	.quad	_ZN6Halide7Runtime8Internal15Synchronization15parking_control12before_sleepEv
+	.quad	_ZN6Halide7Runtime8Internal15Synchronization22signal_parking_control6unparkEib
+	.quad	_ZN6Halide7Runtime8Internal15Synchronization15parking_control16requeue_callbackERKNS2_15validate_actionEbb
+	.size	_ZTVN6Halide7Runtime8Internal15Synchronization22signal_parking_controlE, 48
+
+	.type	.L.str.5.6,@object              # @.str.5.6
+	.section	.rodata.str1.1,"aMS",@progbits,1
+.L.str.5.6:
+	.asciz	"/home/halidenightly/build_bot/worker/halide-nightly-main-llvm16-x86-64-linux-cmake/halide-source/src/runtime/synchronization_common.h:859 halide_abort_if_false() failed: val & 0x1\n"
+	.size	.L.str.5.6, 181
+
+	.type	_ZTVN6Halide7Runtime8Internal15Synchronization20wait_parking_controlE,@object # @_ZTVN6Halide7Runtime8Internal15Synchronization20wait_parking_controlE
+	.section	.data.rel.ro._ZTVN6Halide7Runtime8Internal15Synchronization20wait_parking_controlE,"aGw",@progbits,_ZTVN6Halide7Runtime8Internal15Synchronization20wait_parking_controlE,comdat
+	.weak	_ZTVN6Halide7Runtime8Internal15Synchronization20wait_parking_controlE
+	.p2align	3, 0x0
+_ZTVN6Halide7Runtime8Internal15Synchronization20wait_parking_controlE:
+	.quad	0
+	.quad	0
+	.quad	_ZN6Halide7Runtime8Internal15Synchronization20wait_parking_control8validateERNS2_15validate_actionE
+	.quad	_ZN6Halide7Runtime8Internal15Synchronization20wait_parking_control12before_sleepEv
+	.quad	_ZN6Halide7Runtime8Internal15Synchronization20wait_parking_control6unparkEib
+	.quad	_ZN6Halide7Runtime8Internal15Synchronization15parking_control16requeue_callbackERKNS2_15validate_actionEbb
+	.size	_ZTVN6Halide7Runtime8Internal15Synchronization20wait_parking_controlE, 48
+
+	.type	.L.str.6,@object                # @.str.6
+	.section	.rodata.str1.1,"aMS",@progbits,1
+.L.str.6:
+	.asciz	"/home/halidenightly/build_bot/worker/halide-nightly-main-llvm16-x86-64-linux-cmake/halide-source/src/runtime/thread_pool_common.h:155 halide_abort_if_false() failed: bytes == limit && \"Logic error in thread pool work queue initialization.\\n\"\n"
+	.size	.L.str.6, 243
+
+	.type	_ZN6Halide7Runtime8Internal17custom_get_symbolE,@object # @_ZN6Halide7Runtime8Internal17custom_get_symbolE
+	.data
+	.weak	_ZN6Halide7Runtime8Internal17custom_get_symbolE
+	.p2align	3, 0x0
+_ZN6Halide7Runtime8Internal17custom_get_symbolE:
+	.quad	halide_default_get_symbol
+	.size	_ZN6Halide7Runtime8Internal17custom_get_symbolE, 8
+
+	.type	_ZN6Halide7Runtime8Internal19custom_load_libraryE,@object # @_ZN6Halide7Runtime8Internal19custom_load_libraryE
+	.weak	_ZN6Halide7Runtime8Internal19custom_load_libraryE
+	.p2align	3, 0x0
+_ZN6Halide7Runtime8Internal19custom_load_libraryE:
+	.quad	halide_default_load_library
+	.size	_ZN6Halide7Runtime8Internal19custom_load_libraryE, 8
+
+	.type	_ZN6Halide7Runtime8Internal25custom_get_library_symbolE,@object # @_ZN6Halide7Runtime8Internal25custom_get_library_symbolE
+	.weak	_ZN6Halide7Runtime8Internal25custom_get_library_symbolE
+	.p2align	3, 0x0
+_ZN6Halide7Runtime8Internal25custom_get_library_symbolE:
+	.quad	halide_default_get_library_symbol
+	.size	_ZN6Halide7Runtime8Internal25custom_get_library_symbolE, 8
+
+	.type	_ZN6Halide7Runtime8Internal17halide_gpu_deviceE,@object # @_ZN6Halide7Runtime8Internal17halide_gpu_deviceE
+	.bss
+	.weak	_ZN6Halide7Runtime8Internal17halide_gpu_deviceE
+	.p2align	2, 0x0
+_ZN6Halide7Runtime8Internal17halide_gpu_deviceE:
+	.long	0                               # 0x0
+	.size	_ZN6Halide7Runtime8Internal17halide_gpu_deviceE, 4
+
+	.type	_ZN6Halide7Runtime8Internal22halide_gpu_device_lockE,@object # @_ZN6Halide7Runtime8Internal22halide_gpu_device_lockE
+	.weak	_ZN6Halide7Runtime8Internal22halide_gpu_device_lockE
+_ZN6Halide7Runtime8Internal22halide_gpu_device_lockE:
+	.byte	0                               # 0x0
+	.size	_ZN6Halide7Runtime8Internal22halide_gpu_device_lockE, 1
+
+	.type	_ZN6Halide7Runtime8Internal29halide_gpu_device_initializedE,@object # @_ZN6Halide7Runtime8Internal29halide_gpu_device_initializedE
+	.weak	_ZN6Halide7Runtime8Internal29halide_gpu_device_initializedE
+_ZN6Halide7Runtime8Internal29halide_gpu_device_initializedE:
+	.byte	0                               # 0x0
+	.size	_ZN6Halide7Runtime8Internal29halide_gpu_device_initializedE, 1
+
+	.type	.L.str.8,@object                # @.str.8
+	.section	.rodata.str1.1,"aMS",@progbits,1
+.L.str.8:
+	.asciz	"HL_GPU_DEVICE"
+	.size	.L.str.8, 14
+
+	.type	_ZN6Halide7Runtime8Internal19halide_trace_bufferE,@object # @_ZN6Halide7Runtime8Internal19halide_trace_bufferE
+	.bss
+	.weak	_ZN6Halide7Runtime8Internal19halide_trace_bufferE
+	.p2align	3, 0x0
+_ZN6Halide7Runtime8Internal19halide_trace_bufferE:
+	.quad	0
+	.size	_ZN6Halide7Runtime8Internal19halide_trace_bufferE, 8
+
+	.type	_ZN6Halide7Runtime8Internal17halide_trace_fileE,@object # @_ZN6Halide7Runtime8Internal17halide_trace_fileE
+	.data
+	.weak	_ZN6Halide7Runtime8Internal17halide_trace_fileE
+	.p2align	2, 0x0
+_ZN6Halide7Runtime8Internal17halide_trace_fileE:
+	.long	4294967295                      # 0xffffffff
+	.size	_ZN6Halide7Runtime8Internal17halide_trace_fileE, 4
+
+	.type	_ZN6Halide7Runtime8Internal22halide_trace_file_lockE,@object # @_ZN6Halide7Runtime8Internal22halide_trace_file_lockE
+	.bss
+	.weak	_ZN6Halide7Runtime8Internal22halide_trace_file_lockE
+_ZN6Halide7Runtime8Internal22halide_trace_file_lockE:
+	.byte	0                               # 0x0
+	.size	_ZN6Halide7Runtime8Internal22halide_trace_file_lockE, 1
+
+	.type	_ZN6Halide7Runtime8Internal29halide_trace_file_initializedE,@object # @_ZN6Halide7Runtime8Internal29halide_trace_file_initializedE
+	.weak	_ZN6Halide7Runtime8Internal29halide_trace_file_initializedE
+_ZN6Halide7Runtime8Internal29halide_trace_file_initializedE:
+	.byte	0                               # 0x0
+	.size	_ZN6Halide7Runtime8Internal29halide_trace_file_initializedE, 1
+
+	.type	_ZN6Halide7Runtime8Internal35halide_trace_file_internally_openedE,@object # @_ZN6Halide7Runtime8Internal35halide_trace_file_internally_openedE
+	.weak	_ZN6Halide7Runtime8Internal35halide_trace_file_internally_openedE
+	.p2align	3, 0x0
+_ZN6Halide7Runtime8Internal35halide_trace_file_internally_openedE:
+	.quad	0
+	.size	_ZN6Halide7Runtime8Internal35halide_trace_file_internally_openedE, 8
+
+	.type	_ZZ20halide_default_traceE3ids,@object # @_ZZ20halide_default_traceE3ids
+	.data
+	.p2align	2, 0x0
+_ZZ20halide_default_traceE3ids:
+	.long	1                               # 0x1
+	.size	_ZZ20halide_default_traceE3ids, 4
+
+	.type	.L.str.1.10,@object             # @.str.1.10
+	.section	.rodata.str1.1,"aMS",@progbits,1
+.L.str.1.10:
+	.zero	1
+	.size	.L.str.1.10, 1
+
+	.type	.L.str.2.11,@object             # @.str.2.11
+.L.str.2.11:
+	.asciz	"/home/halidenightly/build_bot/worker/halide-nightly-main-llvm16-x86-64-linux-cmake/halide-source/src/runtime/tracing.cpp:238 halide_abort_if_false() failed: print_bits <= 64 && \"Tracing bad type\"\n"
+	.size	.L.str.2.11, 197
+
+	.type	.L.str.3.12,@object             # @.str.3.12
+.L.str.3.12:
+	.asciz	"Load"
+	.size	.L.str.3.12, 5
+
+	.type	.L.str.4.13,@object             # @.str.4.13
+.L.str.4.13:
+	.asciz	"Store"
+	.size	.L.str.4.13, 6
+
+	.type	.L.str.5.14,@object             # @.str.5.14
+.L.str.5.14:
+	.asciz	"Begin realization"
+	.size	.L.str.5.14, 18
+
+	.type	.L.str.6.15,@object             # @.str.6.15
+.L.str.6.15:
+	.asciz	"End realization"
+	.size	.L.str.6.15, 16
+
+	.type	.L.str.7,@object                # @.str.7
+.L.str.7:
+	.asciz	"Produce"
+	.size	.L.str.7, 8
+
+	.type	.L.str.8.16,@object             # @.str.8.16
+.L.str.8.16:
+	.asciz	"End produce"
+	.size	.L.str.8.16, 12
+
+	.type	.L.str.9.17,@object             # @.str.9.17
+.L.str.9.17:
+	.asciz	"Consume"
+	.size	.L.str.9.17, 8
+
+	.type	.L.str.10,@object               # @.str.10
+.L.str.10:
+	.asciz	"End consume"
+	.size	.L.str.10, 12
+
+	.type	.L.str.11,@object               # @.str.11
+.L.str.11:
+	.asciz	"Begin pipeline"
+	.size	.L.str.11, 15
+
+	.type	.L.str.12,@object               # @.str.12
+.L.str.12:
+	.asciz	"End pipeline"
+	.size	.L.str.12, 13
+
+	.type	.L.str.13,@object               # @.str.13
+.L.str.13:
+	.asciz	"Tag"
+	.size	.L.str.13, 4
+
+	.type	.Lreltable.halide_default_trace,@object # @reltable.halide_default_trace
+	.section	.rodata,"a",@progbits
+	.p2align	2, 0x0
+.Lreltable.halide_default_trace:
+	.long	.L.str.3.12-.Lreltable.halide_default_trace
+	.long	.L.str.4.13-.Lreltable.halide_default_trace
+	.long	.L.str.5.14-.Lreltable.halide_default_trace
+	.long	.L.str.6.15-.Lreltable.halide_default_trace
+	.long	.L.str.7-.Lreltable.halide_default_trace
+	.long	.L.str.8.16-.Lreltable.halide_default_trace
+	.long	.L.str.9.17-.Lreltable.halide_default_trace
+	.long	.L.str.10-.Lreltable.halide_default_trace
+	.long	.L.str.11-.Lreltable.halide_default_trace
+	.long	.L.str.12-.Lreltable.halide_default_trace
+	.long	.L.str.13-.Lreltable.halide_default_trace
+	.size	.Lreltable.halide_default_trace, 44
+
+	.type	.L.str.17,@object               # @.str.17
+	.section	.rodata.str1.1,"aMS",@progbits,1
+.L.str.17:
+	.asciz	"<"
+	.size	.L.str.17, 2
+
+	.type	.L.str.18,@object               # @.str.18
+.L.str.18:
+	.asciz	">, <"
+	.size	.L.str.18, 5
+
+	.type	.L.str.20,@object               # @.str.20
+.L.str.20:
+	.asciz	">)"
+	.size	.L.str.20, 3
+
+	.type	.L.str.22,@object               # @.str.22
+.L.str.22:
+	.asciz	" = <"
+	.size	.L.str.22, 5
+
+	.type	.L.str.23,@object               # @.str.23
+.L.str.23:
+	.asciz	" = "
+	.size	.L.str.23, 4
+
+	.type	.L.str.24,@object               # @.str.24
+.L.str.24:
+	.asciz	"/home/halidenightly/build_bot/worker/halide-nightly-main-llvm16-x86-64-linux-cmake/halide-source/src/runtime/tracing.cpp:307 halide_abort_if_false() failed: print_bits >= 16 && \"Tracing a bad type\"\n"
+	.size	.L.str.24, 199
+
+	.type	.L.str.25,@object               # @.str.25
+.L.str.25:
+	.asciz	">"
+	.size	.L.str.25, 2
+
+	.type	.L.str.26,@object               # @.str.26
+.L.str.26:
+	.asciz	" tag = \""
+	.size	.L.str.26, 9
+
+	.type	.L.str.27,@object               # @.str.27
+.L.str.27:
+	.asciz	"\""
+	.size	.L.str.27, 2
+
+	.type	_ZN6Halide7Runtime8Internal19halide_custom_traceE,@object # @_ZN6Halide7Runtime8Internal19halide_custom_traceE
+	.data
+	.weak	_ZN6Halide7Runtime8Internal19halide_custom_traceE
+	.p2align	3, 0x0
+_ZN6Halide7Runtime8Internal19halide_custom_traceE:
+	.quad	halide_default_trace
+	.size	_ZN6Halide7Runtime8Internal19halide_custom_traceE, 8
+
+	.type	.L.str.28,@object               # @.str.28
+	.section	.rodata.str1.1,"aMS",@progbits,1
+.L.str.28:
+	.asciz	"HL_TRACE_FILE"
+	.size	.L.str.28, 14
+
+	.type	.L.str.29,@object               # @.str.29
+.L.str.29:
+	.asciz	"ab"
+	.size	.L.str.29, 3
+
+	.type	.L.str.30,@object               # @.str.30
+.L.str.30:
+	.asciz	"/home/halidenightly/build_bot/worker/halide-nightly-main-llvm16-x86-64-linux-cmake/halide-source/src/runtime/tracing.cpp:371 halide_abort_if_false() failed: file && \"Failed to open trace file\\n\"\n"
+	.size	.L.str.30, 196
+
+	.type	.L.str.31,@object               # @.str.31
+.L.str.31:
+	.asciz	"/home/halidenightly/build_bot/worker/halide-nightly-main-llvm16-x86-64-linux-cmake/halide-source/src/runtime/tracing.cpp:103 halide_abort_if_false() failed: size <= buffer_size\n"
+	.size	.L.str.31, 178
+
+	.type	.L.str.32,@object               # @.str.32
+.L.str.32:
+	.asciz	"/home/halidenightly/build_bot/worker/halide-nightly-main-llvm16-x86-64-linux-cmake/halide-source/src/runtime/tracing.cpp:131 halide_abort_if_false() failed: success && \"Could not write to trace file\"\n"
+	.size	.L.str.32, 201
+
+	.type	_ZN6Halide7Runtime8Internal30pixel_type_to_tiff_sample_typeE,@object # @_ZN6Halide7Runtime8Internal30pixel_type_to_tiff_sample_typeE
+	.data
+	.weak	_ZN6Halide7Runtime8Internal30pixel_type_to_tiff_sample_typeE
+	.p2align	1, 0x0
+_ZN6Halide7Runtime8Internal30pixel_type_to_tiff_sample_typeE:
+	.short	3                               # 0x3
+	.short	3                               # 0x3
+	.short	1                               # 0x1
+	.short	2                               # 0x2
+	.short	1                               # 0x1
+	.short	2                               # 0x2
+	.short	1                               # 0x1
+	.short	2                               # 0x2
+	.short	1                               # 0x1
+	.short	2                               # 0x2
+	.size	_ZN6Halide7Runtime8Internal30pixel_type_to_tiff_sample_typeE, 20
+
+	.type	_ZN6Halide7Runtime8Internal31pixel_type_to_matlab_class_codeE,@object # @_ZN6Halide7Runtime8Internal31pixel_type_to_matlab_class_codeE
+	.weak	_ZN6Halide7Runtime8Internal31pixel_type_to_matlab_class_codeE
+_ZN6Halide7Runtime8Internal31pixel_type_to_matlab_class_codeE:
+	.ascii	"\007\006\t\b\013\n\r\f\017\016"
+	.size	_ZN6Halide7Runtime8Internal31pixel_type_to_matlab_class_codeE, 10
+
+	.type	_ZN6Halide7Runtime8Internal30pixel_type_to_matlab_type_codeE,@object # @_ZN6Halide7Runtime8Internal30pixel_type_to_matlab_type_codeE
+	.weak	_ZN6Halide7Runtime8Internal30pixel_type_to_matlab_type_codeE
+_ZN6Halide7Runtime8Internal30pixel_type_to_matlab_type_codeE:
+	.ascii	"\007\t\002\001\004\003\006\005\r\f"
+	.size	_ZN6Halide7Runtime8Internal30pixel_type_to_matlab_type_codeE, 10
+
+	.type	.L.str.34,@object               # @.str.34
+	.section	.rodata.str1.1,"aMS",@progbits,1
+.L.str.34:
+	.asciz	"Bounds query buffer passed to halide_debug_to_file"
+	.size	.L.str.34, 51
+
+	.type	.L.str.1.35,@object             # @.str.1.35
+.L.str.1.35:
+	.asciz	"Can't debug_to_file a Func with more than four dimensions\n"
+	.size	.L.str.1.35, 59
+
+	.type	.L.str.2.36,@object             # @.str.2.36
+.L.str.2.36:
+	.asciz	"wb"
+	.size	.L.str.2.36, 3
+
+	.type	.L.str.3.37,@object             # @.str.3.37
+.L.str.3.37:
+	.asciz	".tiff"
+	.size	.L.str.3.37, 6
+
+	.type	.L.str.4.38,@object             # @.str.4.38
+.L.str.4.38:
+	.asciz	".tif"
+	.size	.L.str.4.38, 5
+
+	.type	.L.str.5.39,@object             # @.str.5.39
+.L.str.5.39:
+	.asciz	".mat"
+	.size	.L.str.5.39, 5
+
+	.type	.L__const.halide_debug_to_file.header,@object # @__const.halide_debug_to_file.header
+	.section	.rodata,"a",@progbits
+.L__const.halide_debug_to_file.header:
+	.asciz	"MATLAB 5.0 MAT-file, produced by Halide                                                                                     \000\001IM"
+	.size	.L__const.halide_debug_to_file.header, 129
+
+	.type	.L.str.6.40,@object             # @.str.6.40
+	.section	.rodata.str1.1,"aMS",@progbits,1
+.L.str.6.40:
+	.asciz	"Can't debug_to_file to a .mat file greater than 4GB\n"
+	.size	.L.str.6.40, 53
+
+	.type	_ZN6Halide7Runtime8Internal16memoization_lockE,@object # @_ZN6Halide7Runtime8Internal16memoization_lockE
+	.bss
+	.weak	_ZN6Halide7Runtime8Internal16memoization_lockE
+	.p2align	3, 0x0
+_ZN6Halide7Runtime8Internal16memoization_lockE:
+	.zero	8
+	.size	_ZN6Halide7Runtime8Internal16memoization_lockE, 8
+
+	.type	_ZN6Halide7Runtime8Internal13cache_entriesE,@object # @_ZN6Halide7Runtime8Internal13cache_entriesE
+	.weak	_ZN6Halide7Runtime8Internal13cache_entriesE
+	.p2align	3, 0x0
+_ZN6Halide7Runtime8Internal13cache_entriesE:
+	.zero	2048
+	.size	_ZN6Halide7Runtime8Internal13cache_entriesE, 2048
+
+	.type	_ZN6Halide7Runtime8Internal18most_recently_usedE,@object # @_ZN6Halide7Runtime8Internal18most_recently_usedE
+	.weak	_ZN6Halide7Runtime8Internal18most_recently_usedE
+	.p2align	3, 0x0
+_ZN6Halide7Runtime8Internal18most_recently_usedE:
+	.quad	0
+	.size	_ZN6Halide7Runtime8Internal18most_recently_usedE, 8
+
+	.type	_ZN6Halide7Runtime8Internal19least_recently_usedE,@object # @_ZN6Halide7Runtime8Internal19least_recently_usedE
+	.weak	_ZN6Halide7Runtime8Internal19least_recently_usedE
+	.p2align	3, 0x0
+_ZN6Halide7Runtime8Internal19least_recently_usedE:
+	.quad	0
+	.size	_ZN6Halide7Runtime8Internal19least_recently_usedE, 8
+
+	.type	_ZN6Halide7Runtime8Internal14max_cache_sizeE,@object # @_ZN6Halide7Runtime8Internal14max_cache_sizeE
+	.data
+	.weak	_ZN6Halide7Runtime8Internal14max_cache_sizeE
+	.p2align	3, 0x0
+_ZN6Halide7Runtime8Internal14max_cache_sizeE:
+	.quad	1048576                         # 0x100000
+	.size	_ZN6Halide7Runtime8Internal14max_cache_sizeE, 8
+
+	.type	_ZN6Halide7Runtime8Internal18current_cache_sizeE,@object # @_ZN6Halide7Runtime8Internal18current_cache_sizeE
+	.bss
+	.weak	_ZN6Halide7Runtime8Internal18current_cache_sizeE
+	.p2align	3, 0x0
+_ZN6Halide7Runtime8Internal18current_cache_sizeE:
+	.quad	0                               # 0x0
+	.size	_ZN6Halide7Runtime8Internal18current_cache_sizeE, 8
+
+	.type	.L.str.2.42,@object             # @.str.2.42
+	.section	.rodata.str1.1,"aMS",@progbits,1
+.L.str.2.42:
+	.asciz	"/home/halidenightly/build_bot/worker/halide-nightly-main-llvm16-x86-64-linux-cmake/halide-source/src/runtime/cache.cpp:284 halide_abort_if_false() failed: prev_hash_entry != nullptr\n"
+	.size	.L.str.2.42, 183
+
+	.type	.L.str.3.43,@object             # @.str.3.43
+.L.str.3.43:
+	.asciz	"/home/halidenightly/build_bot/worker/halide-nightly-main-llvm16-x86-64-linux-cmake/halide-source/src/runtime/cache.cpp:373 halide_abort_if_false() failed: entry->more_recent != nullptr\n"
+	.size	.L.str.3.43, 186
+
+	.type	.L.str.4.44,@object             # @.str.4.44
+.L.str.4.44:
+	.asciz	"/home/halidenightly/build_bot/worker/halide-nightly-main-llvm16-x86-64-linux-cmake/halide-source/src/runtime/cache.cpp:377 halide_abort_if_false() failed: least_recently_used == entry\n"
+	.size	.L.str.4.44, 185
+
+	.type	.L.str.5.45,@object             # @.str.5.45
+.L.str.5.45:
+	.asciz	"/home/halidenightly/build_bot/worker/halide-nightly-main-llvm16-x86-64-linux-cmake/halide-source/src/runtime/cache.cpp:380 halide_abort_if_false() failed: entry->more_recent != nullptr\n"
+	.size	.L.str.5.45, 186
+
+	.type	.L.str.9.46,@object             # @.str.9.46
+.L.str.9.46:
+	.asciz	"/home/halidenightly/build_bot/worker/halide-nightly-main-llvm16-x86-64-linux-cmake/halide-source/src/runtime/cache.cpp:472 halide_abort_if_false() failed: no_host_pointers_equal\n"
+	.size	.L.str.9.46, 179
+
+	.type	.L.str.12.47,@object            # @.str.12.47
+.L.str.12.47:
+	.asciz	"/home/halidenightly/build_bot/worker/halide-nightly-main-llvm16-x86-64-linux-cmake/halide-source/src/runtime/cache.cpp:550 halide_abort_if_false() failed: entry->in_use_count > 0\n"
+	.size	.L.str.12.47, 180
+
+	.type	.L.str.50,@object               # @.str.50
+.L.str.50:
+	.asciz	"<nullptr>"
+	.size	.L.str.50, 10
+
+	.type	.L.str.1.57,@object             # @.str.1.57
+.L.str.1.57:
+	.asciz	"-nan"
+	.size	.L.str.1.57, 5
+
+	.type	.L.str.2.58,@object             # @.str.2.58
+.L.str.2.58:
+	.asciz	"nan"
+	.size	.L.str.2.58, 4
+
+	.type	.L.str.3.59,@object             # @.str.3.59
+.L.str.3.59:
+	.asciz	"-inf"
+	.size	.L.str.3.59, 5
+
+	.type	.L.str.4.60,@object             # @.str.4.60
+.L.str.4.60:
+	.asciz	"inf"
+	.size	.L.str.4.60, 4
+
+	.type	.L.str.5.61,@object             # @.str.5.61
+.L.str.5.61:
+	.asciz	"-0.000000e+00"
+	.size	.L.str.5.61, 14
+
+	.type	.L.str.6.62,@object             # @.str.6.62
+.L.str.6.62:
+	.asciz	"0.000000e+00"
+	.size	.L.str.6.62, 13
+
+	.type	.L.str.7.63,@object             # @.str.7.63
+.L.str.7.63:
+	.asciz	"-0.000000"
+	.size	.L.str.7.63, 10
+
+	.type	.L.str.8.64,@object             # @.str.8.64
+.L.str.8.64:
+	.asciz	"0.000000"
+	.size	.L.str.8.64, 9
+
+	.type	.L.str.9.65,@object             # @.str.9.65
+.L.str.9.65:
+	.asciz	"-"
+	.size	.L.str.9.65, 2
+
+	.type	.L.str.11.67,@object            # @.str.11.67
+.L.str.11.67:
+	.asciz	"e+"
+	.size	.L.str.11.67, 3
+
+	.type	.L.str.12.68,@object            # @.str.12.68
+.L.str.12.68:
+	.asciz	"e-"
+	.size	.L.str.12.68, 3
+
+	.type	.L.str.13.71,@object            # @.str.13.71
+.L.str.13.71:
+	.asciz	"0123456789abcdef"
+	.size	.L.str.13.71, 17
+
+	.type	.L.str.14.77,@object            # @.str.14.77
+.L.str.14.77:
+	.asciz	"int"
+	.size	.L.str.14.77, 4
+
+	.type	.L.str.15.76,@object            # @.str.15.76
+.L.str.15.76:
+	.asciz	"uint"
+	.size	.L.str.15.76, 5
+
+	.type	.L.str.16.75,@object            # @.str.16.75
+.L.str.16.75:
+	.asciz	"float"
+	.size	.L.str.16.75, 6
+
+	.type	.L.str.17.74,@object            # @.str.17.74
+.L.str.17.74:
+	.asciz	"handle"
+	.size	.L.str.17.74, 7
+
+	.type	.L.str.18.73,@object            # @.str.18.73
+.L.str.18.73:
+	.asciz	"bfloat"
+	.size	.L.str.18.73, 7
+
+	.type	.L.str.19.72,@object            # @.str.19.72
+.L.str.19.72:
+	.asciz	"bad_type_code"
+	.size	.L.str.19.72, 14
+
+	.type	.L.str.20.78,@object            # @.str.20.78
+.L.str.20.78:
+	.asciz	"x"
+	.size	.L.str.20.78, 2
+
+	.type	.L.str.21.79,@object            # @.str.21.79
+.L.str.21.79:
+	.asciz	"nullptr"
+	.size	.L.str.21.79, 8
+
+	.type	.L.str.22.80,@object            # @.str.22.80
+.L.str.22.80:
+	.asciz	" -> buffer("
+	.size	.L.str.22.80, 12
+
+	.type	.L.str.24.83,@object            # @.str.24.83
+.L.str.24.83:
+	.asciz	", {"
+	.size	.L.str.24.83, 4
+
+	.type	.L.str.25.84,@object            # @.str.25.84
+.L.str.25.84:
+	.asciz	"}"
+	.size	.L.str.25.84, 2
+
+	.type	_ZN6Halide7Runtime8Internal36halide_reuse_device_allocations_flagE,@object # @_ZN6Halide7Runtime8Internal36halide_reuse_device_allocations_flagE
+	.data
+	.weak	_ZN6Halide7Runtime8Internal36halide_reuse_device_allocations_flagE
+_ZN6Halide7Runtime8Internal36halide_reuse_device_allocations_flagE:
+	.byte	1                               # 0x1
+	.size	_ZN6Halide7Runtime8Internal36halide_reuse_device_allocations_flagE, 1
+
+	.type	_ZN6Halide7Runtime8Internal21allocation_pools_lockE,@object # @_ZN6Halide7Runtime8Internal21allocation_pools_lockE
+	.bss
+	.weak	_ZN6Halide7Runtime8Internal21allocation_pools_lockE
+	.p2align	3, 0x0
+_ZN6Halide7Runtime8Internal21allocation_pools_lockE:
+	.zero	8
+	.size	_ZN6Halide7Runtime8Internal21allocation_pools_lockE, 8
+
+	.type	_ZN6Halide7Runtime8Internal23device_allocation_poolsE,@object # @_ZN6Halide7Runtime8Internal23device_allocation_poolsE
+	.weak	_ZN6Halide7Runtime8Internal23device_allocation_poolsE
+	.p2align	3, 0x0
+_ZN6Halide7Runtime8Internal23device_allocation_poolsE:
+	.quad	0
+	.size	_ZN6Halide7Runtime8Internal23device_allocation_poolsE, 8
+
+	.type	_ZN6Halide7Runtime8Internal17device_copy_mutexE,@object # @_ZN6Halide7Runtime8Internal17device_copy_mutexE
+	.weak	_ZN6Halide7Runtime8Internal17device_copy_mutexE
+	.p2align	3, 0x0
+_ZN6Halide7Runtime8Internal17device_copy_mutexE:
+	.zero	8
+	.size	_ZN6Halide7Runtime8Internal17device_copy_mutexE, 8
+
+	.type	.L.str.6.91,@object             # @.str.6.91
+	.section	.rodata.str1.1,"aMS",@progbits,1
+.L.str.6.91:
+	.asciz	"halide_copy_to_host"
+	.size	.L.str.6.91, 20
+
+	.type	.L.str.7.92,@object             # @.str.7.92
+.L.str.7.92:
+	.asciz	"halide_copy_to_device"
+	.size	.L.str.7.92, 22
+
+	.type	.L.str.9.93,@object             # @.str.9.93
+.L.str.9.93:
+	.asciz	"halide_copy_to_device does not support switching interfaces"
+	.size	.L.str.9.93, 60
+
+	.type	.L.str.16.96,@object            # @.str.16.96
+.L.str.16.96:
+	.asciz	"halide_device_sync"
+	.size	.L.str.16.96, 19
+
+	.type	.L.str.17.94,@object            # @.str.17.94
+.L.str.17.94:
+	.asciz	"halide_device_malloc"
+	.size	.L.str.17.94, 21
+
+	.type	.L.str.20.95,@object            # @.str.20.95
+.L.str.20.95:
+	.asciz	"halide_device_malloc doesn't support switching interfaces\n"
+	.size	.L.str.20.95, 59
+
+	.type	.L.str.21.99,@object            # @.str.21.99
+.L.str.21.99:
+	.asciz	"halide_device_free"
+	.size	.L.str.21.99, 19
+
+	.type	.L.str.22.100,@object           # @.str.22.100
+.L.str.22.100:
+	.asciz	"halide_device_and_host_malloc"
+	.size	.L.str.22.100, 30
+
+	.type	.L.str.24.101,@object           # @.str.24.101
+.L.str.24.101:
+	.asciz	"halide_device_and_host_malloc doesn't support switching interfaces"
+	.size	.L.str.24.101, 67
+
+	.type	.L.str.26.102,@object           # @.str.26.102
+.L.str.26.102:
+	.asciz	"halide_device_and_host_free"
+	.size	.L.str.26.102, 28
+
+	.type	.L.str.27.103,@object           # @.str.27.103
+.L.str.27.103:
+	.asciz	"halide_default_device_and_host_malloc"
+	.size	.L.str.27.103, 38
+
+	.type	.L.str.28.104,@object           # @.str.28.104
+.L.str.28.104:
+	.asciz	"halide_default_device_and_host_free"
+	.size	.L.str.28.104, 36
+
+	.type	.L.str.29.105,@object           # @.str.29.105
+.L.str.29.105:
+	.asciz	"halide_device_wrap_native"
+	.size	.L.str.29.105, 26
+
+	.type	.L.str.30.106,@object           # @.str.30.106
+.L.str.30.106:
+	.asciz	"halide_device_wrap_native doesn't support switching interfaces"
+	.size	.L.str.30.106, 63
+
+	.type	.L.str.31.107,@object           # @.str.31.107
+.L.str.31.107:
+	.asciz	"halide_device_detach_native"
+	.size	.L.str.31.107, 28
+
+	.type	.L.str.32.108,@object           # @.str.32.108
+.L.str.32.108:
+	.asciz	"buf->device == 0 in halide_device_detach_native() after detach_native()\n"
+	.size	.L.str.32.108, 73
+
+	.type	.L.str.34.109,@object           # @.str.34.109
+.L.str.34.109:
+	.asciz	"halide_default_device_detach_native"
+	.size	.L.str.34.109, 36
+
+	.type	.L.str.40,@object               # @.str.40
+.L.str.40:
+	.asciz	"halide_buffer_copy does not support switching device interfaces"
+	.size	.L.str.40, 64
+
+	.type	.L.str.53,@object               # @.str.53
+.L.str.53:
+	.asciz	"Failure in halide_buffer_copy_already_locked"
+	.size	.L.str.53, 45
+
+	.type	.L.str.58,@object               # @.str.58
+.L.str.58:
+	.asciz	"device_interface does not support cropping"
+	.size	.L.str.58, 43
+
+	.type	.L.str.59,@object               # @.str.59
+.L.str.59:
+	.asciz	"device_interface does not support slicing"
+	.size	.L.str.59, 42
+
+	.type	.L.str.60,@object               # @.str.60
+.L.str.60:
+	.asciz	"destination buffer already has a device allocation"
+	.size	.L.str.60, 51
+
+	.type	.L.str.61,@object               # @.str.61
+.L.str.61:
+	.asciz	"src and dst must have identical dimensionality"
+	.size	.L.str.61, 47
+
+	.type	.L.str.64,@object               # @.str.64
+.L.str.64:
+	.asciz	"dst must have exactly one fewer dimension than src"
+	.size	.L.str.64, 51
+
+	.type	.L.str.112,@object              # @.str.112
+.L.str.112:
+	.asciz	"Bounds inference call to external stage "
+	.size	.L.str.112, 41
+
+	.type	.L.str.1.113,@object            # @.str.1.113
+.L.str.1.113:
+	.asciz	" returned non-zero value: "
+	.size	.L.str.1.113, 27
+
+	.type	.L.str.2.114,@object            # @.str.2.114
+.L.str.2.114:
+	.asciz	"Call to external stage "
+	.size	.L.str.2.114, 24
+
+	.type	.L.str.3.115,@object            # @.str.3.115
+.L.str.3.115:
+	.asciz	"Bounds given for "
+	.size	.L.str.3.115, 18
+
+	.type	.L.str.4.116,@object            # @.str.4.116
+.L.str.4.116:
+	.asciz	" in "
+	.size	.L.str.4.116, 5
+
+	.type	.L.str.5.117,@object            # @.str.5.117
+.L.str.5.117:
+	.asciz	" (from "
+	.size	.L.str.5.117, 8
+
+	.type	.L.str.6.118,@object            # @.str.6.118
+.L.str.6.118:
+	.asciz	" to "
+	.size	.L.str.6.118, 5
+
+	.type	.L.str.7.119,@object            # @.str.7.119
+.L.str.7.119:
+	.asciz	") do not cover required region (from "
+	.size	.L.str.7.119, 38
+
+	.type	.L.str.8.120,@object            # @.str.8.120
+.L.str.8.120:
+	.asciz	")"
+	.size	.L.str.8.120, 2
+
+	.type	.L.str.9.121,@object            # @.str.9.121
+.L.str.9.121:
+	.asciz	" has type "
+	.size	.L.str.9.121, 11
+
+	.type	.L.str.10.122,@object           # @.str.10.122
+.L.str.10.122:
+	.asciz	" but type of the buffer passed in is "
+	.size	.L.str.10.122, 38
+
+	.type	.L.str.11.123,@object           # @.str.11.123
+.L.str.11.123:
+	.asciz	" requires a buffer of exactly "
+	.size	.L.str.11.123, 31
+
+	.type	.L.str.12.124,@object           # @.str.12.124
+.L.str.12.124:
+	.asciz	" dimensions, but the buffer passed in has "
+	.size	.L.str.12.124, 43
+
+	.type	.L.str.13.125,@object           # @.str.13.125
+.L.str.13.125:
+	.asciz	" dimensions"
+	.size	.L.str.13.125, 12
+
+	.type	.L.str.14.126,@object           # @.str.14.126
+.L.str.14.126:
+	.asciz	" is accessed at "
+	.size	.L.str.14.126, 17
+
+	.type	.L.str.15.127,@object           # @.str.15.127
+.L.str.15.127:
+	.asciz	", which is before the min ("
+	.size	.L.str.15.127, 28
+
+	.type	.L.str.16.128,@object           # @.str.16.128
+.L.str.16.128:
+	.asciz	") in dimension "
+	.size	.L.str.16.128, 16
+
+	.type	.L.str.17.129,@object           # @.str.17.129
+.L.str.17.129:
+	.asciz	", which is beyond the max ("
+	.size	.L.str.17.129, 28
+
+	.type	.L.str.18.130,@object           # @.str.18.130
+.L.str.18.130:
+	.asciz	"Total allocation for buffer "
+	.size	.L.str.18.130, 29
+
+	.type	.L.str.19.131,@object           # @.str.19.131
+.L.str.19.131:
+	.asciz	" is "
+	.size	.L.str.19.131, 5
+
+	.type	.L.str.20.132,@object           # @.str.20.132
+.L.str.20.132:
+	.asciz	", which exceeds the maximum size of "
+	.size	.L.str.20.132, 37
+
+	.type	.L.str.21.133,@object           # @.str.21.133
+.L.str.21.133:
+	.asciz	"The extents for buffer "
+	.size	.L.str.21.133, 24
+
+	.type	.L.str.22.134,@object           # @.str.22.134
+.L.str.22.134:
+	.asciz	" dimension "
+	.size	.L.str.22.134, 12
+
+	.type	.L.str.23.135,@object           # @.str.23.135
+.L.str.23.135:
+	.asciz	" is negative ("
+	.size	.L.str.23.135, 15
+
+	.type	.L.str.24.136,@object           # @.str.24.136
+.L.str.24.136:
+	.asciz	"Product of extents for buffer "
+	.size	.L.str.24.136, 31
+
+	.type	.L.str.25.137,@object           # @.str.25.137
+.L.str.25.137:
+	.asciz	"Applying the constraints on "
+	.size	.L.str.25.137, 29
+
+	.type	.L.str.26.138,@object           # @.str.26.138
+.L.str.26.138:
+	.asciz	" to the required region made it smaller in dimension "
+	.size	.L.str.26.138, 54
+
+	.type	.L.str.27.139,@object           # @.str.27.139
+.L.str.27.139:
+	.asciz	". "
+	.size	.L.str.27.139, 3
+
+	.type	.L.str.28.140,@object           # @.str.28.140
+.L.str.28.140:
+	.asciz	"Required size: "
+	.size	.L.str.28.140, 16
+
+	.type	.L.str.29.141,@object           # @.str.29.141
+.L.str.29.141:
+	.asciz	"Constrained size: "
+	.size	.L.str.29.141, 19
+
+	.type	.L.str.30.142,@object           # @.str.30.142
+.L.str.30.142:
+	.asciz	"."
+	.size	.L.str.30.142, 2
+
+	.type	.L.str.31.143,@object           # @.str.31.143
+.L.str.31.143:
+	.asciz	"Constraint violated: "
+	.size	.L.str.31.143, 22
+
+	.type	.L.str.32.144,@object           # @.str.32.144
+.L.str.32.144:
+	.asciz	" ("
+	.size	.L.str.32.144, 3
+
+	.type	.L.str.33.145,@object           # @.str.33.145
+.L.str.33.145:
+	.asciz	") == "
+	.size	.L.str.33.145, 6
+
+	.type	.L.str.34.146,@object           # @.str.34.146
+.L.str.34.146:
+	.asciz	"Parameter "
+	.size	.L.str.34.146, 11
+
+	.type	.L.str.35,@object               # @.str.35
+.L.str.35:
+	.asciz	" but must be at least "
+	.size	.L.str.35, 23
+
+	.type	.L.str.36,@object               # @.str.36
+.L.str.36:
+	.asciz	" but must be at most "
+	.size	.L.str.36, 22
+
+	.type	.L.str.37,@object               # @.str.37
+.L.str.37:
+	.asciz	"Out of memory (halide_malloc returned nullptr)"
+	.size	.L.str.37, 47
+
+	.type	.L.str.38,@object               # @.str.38
+.L.str.38:
+	.asciz	"Buffer argument "
+	.size	.L.str.38, 17
+
+	.type	.L.str.39,@object               # @.str.39
+.L.str.39:
+	.asciz	" is nullptr"
+	.size	.L.str.39, 12
+
+	.type	.L.str.40.147,@object           # @.str.40.147
+.L.str.40.147:
+	.asciz	"Failed to dump function "
+	.size	.L.str.40.147, 25
+
+	.type	.L.str.41,@object               # @.str.41
+.L.str.41:
+	.asciz	" to file "
+	.size	.L.str.41, 10
+
+	.type	.L.str.42,@object               # @.str.42
+.L.str.42:
+	.asciz	" with error "
+	.size	.L.str.42, 13
+
+	.type	.L.str.43,@object               # @.str.43
+.L.str.43:
+	.asciz	"The host pointer of "
+	.size	.L.str.43, 21
+
+	.type	.L.str.44,@object               # @.str.44
+.L.str.44:
+	.asciz	" is not aligned to a "
+	.size	.L.str.44, 22
+
+	.type	.L.str.45,@object               # @.str.45
+.L.str.45:
+	.asciz	" bytes boundary."
+	.size	.L.str.45, 17
+
+	.type	.L.str.46,@object               # @.str.46
+.L.str.46:
+	.asciz	"The buffer "
+	.size	.L.str.46, 12
+
+	.type	.L.str.47,@object               # @.str.47
+.L.str.47:
+	.asciz	" is dirty on device, but this pipeline was compiled "
+	.size	.L.str.47, 53
+
+	.type	.L.str.48,@object               # @.str.48
+.L.str.48:
+	.asciz	"with no support for device to host copies."
+	.size	.L.str.48, 43
+
+	.type	.L.str.49,@object               # @.str.49
+.L.str.49:
+	.asciz	" is null, but the pipeline will access it on the host."
+	.size	.L.str.49, 55
+
+	.type	.L.str.50.148,@object           # @.str.50.148
+.L.str.50.148:
+	.asciz	"The folded storage dimension "
+	.size	.L.str.50.148, 30
+
+	.type	.L.str.51,@object               # @.str.51
+.L.str.51:
+	.asciz	" of "
+	.size	.L.str.51, 5
+
+	.type	.L.str.52,@object               # @.str.52
+.L.str.52:
+	.asciz	" was accessed out of order by loop "
+	.size	.L.str.52, 36
+
+	.type	.L.str.53.149,@object           # @.str.53.149
+.L.str.53.149:
+	.asciz	"Cannot fold dimension "
+	.size	.L.str.53.149, 23
+
+	.type	.L.str.54,@object               # @.str.54
+.L.str.54:
+	.asciz	" because an extern stage accesses ["
+	.size	.L.str.54, 36
+
+	.type	.L.str.55,@object               # @.str.55
+.L.str.55:
+	.asciz	", "
+	.size	.L.str.55, 3
+
+	.type	.L.str.56,@object               # @.str.56
+.L.str.56:
+	.asciz	"],"
+	.size	.L.str.56, 3
+
+	.type	.L.str.57,@object               # @.str.57
+.L.str.57:
+	.asciz	" which is outside the range currently valid: ["
+	.size	.L.str.57, 47
+
+	.type	.L.str.58.150,@object           # @.str.58.150
+.L.str.58.150:
+	.asciz	"]."
+	.size	.L.str.58.150, 3
+
+	.type	.L.str.59.151,@object           # @.str.59.151
+.L.str.59.151:
+	.asciz	" which wraps around the boundary of the fold, "
+	.size	.L.str.59.151, 47
+
+	.type	.L.str.60.152,@object           # @.str.60.152
+.L.str.60.152:
+	.asciz	"which occurs at multiples of "
+	.size	.L.str.60.152, 30
+
+	.type	.L.str.61.153,@object           # @.str.61.153
+.L.str.61.153:
+	.asciz	"The fold factor ("
+	.size	.L.str.61.153, 18
+
+	.type	.L.str.62,@object               # @.str.62
+.L.str.62:
+	.asciz	") of dimension "
+	.size	.L.str.62, 16
+
+	.type	.L.str.63,@object               # @.str.63
+.L.str.63:
+	.asciz	" is too small to store the required region accessed by loop "
+	.size	.L.str.63, 61
+
+	.type	.L.str.64.154,@object           # @.str.64.154
+.L.str.64.154:
+	.asciz	")."
+	.size	.L.str.64.154, 3
+
+	.type	.L.str.65,@object               # @.str.65
+.L.str.65:
+	.asciz	"Requirement Failed: ("
+	.size	.L.str.65, 22
+
+	.type	.L.str.66,@object               # @.str.66
+.L.str.66:
+	.asciz	") "
+	.size	.L.str.66, 3
+
+	.type	.L.str.67,@object               # @.str.67
+.L.str.67:
+	.asciz	"A schedule specialized with specialize_fail() was chosen: "
+	.size	.L.str.67, 59
+
+	.type	.L.str.68,@object               # @.str.68
+.L.str.68:
+	.asciz	"Buffer has a non-zero device but no device interface.\n"
+	.size	.L.str.68, 55
+
+	.type	.L.str.69.159,@object           # @.str.69.159
+.L.str.69.159:
+	.asciz	"Buffer has a non-null device_interface but device is 0.\n"
+	.size	.L.str.69.159, 57
+
+	.type	.L.str.70,@object               # @.str.70
+.L.str.70:
+	.asciz	"Buffer has both host and device dirty bits set.\n"
+	.size	.L.str.70, 49
+
+	.type	.L.str.71,@object               # @.str.71
+.L.str.71:
+	.asciz	"Buffer pointer passed to "
+	.size	.L.str.71, 26
+
+	.type	.L.str.72,@object               # @.str.72
+.L.str.72:
+	.asciz	" is null.\n"
+	.size	.L.str.72, 11
+
+	.type	.L.str.73,@object               # @.str.73
+.L.str.73:
+	.asciz	"The explicit allocation bound ("
+	.size	.L.str.73, 32
+
+	.type	.L.str.74,@object               # @.str.74
+.L.str.74:
+	.asciz	" is too small to store the required region ("
+	.size	.L.str.74, 45
+
+	.type	.L.str.75,@object               # @.str.75
+.L.str.75:
+	.asciz	"Buffer could not be cropped (runtime error or unimplemented device option).\n"
+	.size	.L.str.75, 77
+
+	.type	_ZZ25halide_profiler_get_stateE1s,@object # @_ZZ25halide_profiler_get_stateE1s
+	.data
+	.p2align	3, 0x0
+_ZZ25halide_profiler_get_stateE1s:
+	.zero	8
+	.long	1                               # 0x1
+	.long	0                               # 0x0
+	.long	0                               # 0x0
+	.long	0                               # 0x0
+	.quad	0
+	.quad	0
+	.quad	0
+	.size	_ZZ25halide_profiler_get_stateE1s, 48
+
+	.type	.L.str.188,@object              # @.str.188
+	.section	.rodata.str1.1,"aMS",@progbits,1
+.L.str.188:
+	.asciz	"/home/halidenightly/build_bot/worker/halide-nightly-main-llvm16-x86-64-linux-cmake/halide-source/src/runtime/profiler_common.cpp:247 halide_abort_if_false() failed: p_stats != nullptr\n"
+	.size	.L.str.188, 185
+
+	.type	.L.str.1.189,@object            # @.str.1.189
+.L.str.1.189:
+	.asciz	"/home/halidenightly/build_bot/worker/halide-nightly-main-llvm16-x86-64-linux-cmake/halide-source/src/runtime/profiler_common.cpp:276 halide_abort_if_false() failed: p_stats != nullptr\n"
+	.size	.L.str.1.189, 185
+
+	.type	.L.str.2.190,@object            # @.str.2.190
+.L.str.2.190:
+	.asciz	"/home/halidenightly/build_bot/worker/halide-nightly-main-llvm16-x86-64-linux-cmake/halide-source/src/runtime/profiler_common.cpp:277 halide_abort_if_false() failed: func_id >= 0\n"
+	.size	.L.str.2.190, 179
+
+	.type	.L.str.3.191,@object            # @.str.3.191
+.L.str.3.191:
+	.asciz	"/home/halidenightly/build_bot/worker/halide-nightly-main-llvm16-x86-64-linux-cmake/halide-source/src/runtime/profiler_common.cpp:278 halide_abort_if_false() failed: func_id < p_stats->num_funcs\n"
+	.size	.L.str.3.191, 195
+
+	.type	.L.str.4.192,@object            # @.str.4.192
+.L.str.4.192:
+	.asciz	"/home/halidenightly/build_bot/worker/halide-nightly-main-llvm16-x86-64-linux-cmake/halide-source/src/runtime/profiler_common.cpp:314 halide_abort_if_false() failed: p_stats != nullptr\n"
+	.size	.L.str.4.192, 185
+
+	.type	.L.str.5.193,@object            # @.str.5.193
+.L.str.5.193:
+	.asciz	"/home/halidenightly/build_bot/worker/halide-nightly-main-llvm16-x86-64-linux-cmake/halide-source/src/runtime/profiler_common.cpp:315 halide_abort_if_false() failed: func_id >= 0\n"
+	.size	.L.str.5.193, 179
+
+	.type	.L.str.6.194,@object            # @.str.6.194
+.L.str.6.194:
+	.asciz	"/home/halidenightly/build_bot/worker/halide-nightly-main-llvm16-x86-64-linux-cmake/halide-source/src/runtime/profiler_common.cpp:316 halide_abort_if_false() failed: func_id < p_stats->num_funcs\n"
+	.size	.L.str.6.194, 195
+
+	.type	.L.str.7.166,@object            # @.str.7.166
+.L.str.7.166:
+	.asciz	"\n"
+	.size	.L.str.7.166, 2
+
+	.type	.L.str.8.167,@object            # @.str.8.167
+.L.str.8.167:
+	.asciz	" total time: "
+	.size	.L.str.8.167, 14
+
+	.type	.L.str.9.168,@object            # @.str.9.168
+.L.str.9.168:
+	.asciz	" ms"
+	.size	.L.str.9.168, 4
+
+	.type	.L.str.10.169,@object           # @.str.10.169
+.L.str.10.169:
+	.asciz	"  samples: "
+	.size	.L.str.10.169, 12
+
+	.type	.L.str.11.170,@object           # @.str.11.170
+.L.str.11.170:
+	.asciz	"  runs: "
+	.size	.L.str.11.170, 9
+
+	.type	.L.str.12.171,@object           # @.str.12.171
+.L.str.12.171:
+	.asciz	"  time/run: "
+	.size	.L.str.12.171, 13
+
+	.type	.L.str.13.172,@object           # @.str.13.172
+.L.str.13.172:
+	.asciz	" ms\n"
+	.size	.L.str.13.172, 5
+
+	.type	.L.str.14.173,@object           # @.str.14.173
+.L.str.14.173:
+	.asciz	" average threads used: "
+	.size	.L.str.14.173, 24
+
+	.type	.L.str.15.174,@object           # @.str.15.174
+.L.str.15.174:
+	.asciz	" heap allocations: "
+	.size	.L.str.15.174, 20
+
+	.type	.L.str.16.175,@object           # @.str.16.175
+.L.str.16.175:
+	.asciz	"  peak heap usage: "
+	.size	.L.str.16.175, 20
+
+	.type	.L.str.17.176,@object           # @.str.17.176
+.L.str.17.176:
+	.asciz	" bytes\n"
+	.size	.L.str.17.176, 8
+
+	.type	.L.str.18.177,@object           # @.str.18.177
+.L.str.18.177:
+	.asciz	"  "
+	.size	.L.str.18.177, 3
+
+	.type	.L.str.19.178,@object           # @.str.19.178
+.L.str.19.178:
+	.asciz	": "
+	.size	.L.str.19.178, 3
+
+	.type	.L.str.20.179,@object           # @.str.20.179
+.L.str.20.179:
+	.asciz	" "
+	.size	.L.str.20.179, 2
+
+	.type	.L.str.21.180,@object           # @.str.21.180
+.L.str.21.180:
+	.asciz	"ms"
+	.size	.L.str.21.180, 3
+
+	.type	.L.str.22.181,@object           # @.str.22.181
+.L.str.22.181:
+	.asciz	"("
+	.size	.L.str.22.181, 2
+
+	.type	.L.str.23.182,@object           # @.str.23.182
+.L.str.23.182:
+	.asciz	"%)"
+	.size	.L.str.23.182, 3
+
+	.type	.L.str.24.183,@object           # @.str.24.183
+.L.str.24.183:
+	.asciz	"threads: "
+	.size	.L.str.24.183, 10
+
+	.type	.L.str.25.184,@object           # @.str.25.184
+.L.str.25.184:
+	.asciz	" peak: "
+	.size	.L.str.25.184, 8
+
+	.type	.L.str.26.185,@object           # @.str.26.185
+.L.str.26.185:
+	.asciz	" num: "
+	.size	.L.str.26.185, 7
+
+	.type	.L.str.27.186,@object           # @.str.27.186
+.L.str.27.186:
+	.asciz	" avg: "
+	.size	.L.str.27.186, 7
+
+	.type	.L.str.28.187,@object           # @.str.28.187
+.L.str.28.187:
+	.asciz	" stack: "
+	.size	.L.str.28.187, 9
+
+	.type	.L.str.29.165,@object           # @.str.29.165
+.L.str.29.165:
+	.asciz	"Printer buffer allocation failed.\n"
+	.size	.L.str.29.165, 35
+
+	.type	_ZN6Halide7Runtime8Internal30custom_can_use_target_featuresE,@object # @_ZN6Halide7Runtime8Internal30custom_can_use_target_featuresE
+	.data
+	.weak	_ZN6Halide7Runtime8Internal30custom_can_use_target_featuresE
+	.p2align	3, 0x0
+_ZN6Halide7Runtime8Internal30custom_can_use_target_featuresE:
+	.quad	halide_default_can_use_target_features
+	.size	_ZN6Halide7Runtime8Internal30custom_can_use_target_featuresE, 8
+
+	.type	_ZN6Halide7Runtime8Internal27halide_cpu_features_storageE,@object # @_ZN6Halide7Runtime8Internal27halide_cpu_features_storageE
+	.bss
+	.weak	_ZN6Halide7Runtime8Internal27halide_cpu_features_storageE
+	.p2align	3, 0x0
+_ZN6Halide7Runtime8Internal27halide_cpu_features_storageE:
+	.zero	32
+	.size	_ZN6Halide7Runtime8Internal27halide_cpu_features_storageE, 32
+
+	.type	_ZN6Halide7Runtime8Internal31halide_cpu_features_initializedE,@object # @_ZN6Halide7Runtime8Internal31halide_cpu_features_initializedE
+	.weak	_ZN6Halide7Runtime8Internal31halide_cpu_features_initializedE
+_ZN6Halide7Runtime8Internal31halide_cpu_features_initializedE:
+	.byte	0                               # 0x0
+	.size	_ZN6Halide7Runtime8Internal31halide_cpu_features_initializedE, 1
+
+	.type	_ZN6Halide7Runtime8Internal36halide_cpu_features_initialized_lockE,@object # @_ZN6Halide7Runtime8Internal36halide_cpu_features_initialized_lockE
+	.weak	_ZN6Halide7Runtime8Internal36halide_cpu_features_initialized_lockE
+	.p2align	3, 0x0
+_ZN6Halide7Runtime8Internal36halide_cpu_features_initialized_lockE:
+	.zero	8
+	.size	_ZN6Halide7Runtime8Internal36halide_cpu_features_initialized_lockE, 8
+
+	.type	.L.str.199,@object              # @.str.199
+	.section	.rodata.str1.1,"aMS",@progbits,1
+.L.str.199:
+	.asciz	"Internal error: wrong structure size passed to halide_can_use_target_features()\n"
+	.size	.L.str.199, 81
+
+	.type	.L__unnamed_1,@object           # @0
+	.section	.rodata,"a",@progbits
+	.p2align	4, 0x0
+.L__unnamed_1:
+	.zero	32
+	.size	.L__unnamed_1, 32
+
+	.type	.Lstr,@object                   # @str
+	.p2align	5, 0x0
+.Lstr:
+	.asciz	"conv_y"
+	.size	.Lstr, 7
+
+	.type	.L__unnamed_2,@object           # @1
+	.section	.data.rel.ro,"aw",@progbits
+	.p2align	4, 0x0
+.L__unnamed_2:
+	.quad	.Lstr
+	.long	2                               # 0x2
+	.long	2                               # 0x2
+	.byte	2                               # 0x2
+	.byte	32                              # 0x20
+	.short	1                               # 0x1
+	.zero	4
+	.quad	0
+	.quad	0
+	.quad	0
+	.quad	0
+	.quad	.L__unnamed_1
+	.size	.L__unnamed_2, 64
+
+	.type	.Lstr.203,@object               # @str.203
+	.section	.rodata,"a",@progbits
+	.p2align	5, 0x0
+.Lstr.203:
+	.asciz	"x86-64-linux-avx-avx2-f16c-fma-sse41"
+	.size	.Lstr.203, 37
+
+	.type	.Lconv_y_metadata_storage,@object # @conv_y_metadata_storage
+	.section	.data.rel.ro,"aw",@progbits
+	.p2align	4, 0x0
+.Lconv_y_metadata_storage:
+	.long	1                               # 0x1
+	.long	1                               # 0x1
+	.quad	.L__unnamed_2
+	.quad	.Lstr.203
+	.quad	.Lstr
+	.size	.Lconv_y_metadata_storage, 32
+
+	.type	.Lstr.204,@object               # @str.204
+	.section	.rodata,"a",@progbits
+	.p2align	5, 0x0
+.Lstr.204:
+	.asciz	"Output buffer conv_y"
+	.size	.Lstr.204, 21
+
+	.type	.Lstr.205,@object               # @str.205
+	.p2align	5, 0x0
+.Lstr.205:
+	.asciz	"conv_y.stride.0"
+	.size	.Lstr.205, 16
+
+	.type	.Lstr.206,@object               # @str.206
+	.p2align	5, 0x0
+.Lstr.206:
+	.asciz	"1"
+	.size	.Lstr.206, 2
+
+	.type	.Lreltable.halide_type_to_string,@object # @reltable.halide_type_to_string
+	.p2align	2, 0x0
+.Lreltable.halide_type_to_string:
+	.long	.L.str.14.77-.Lreltable.halide_type_to_string
+	.long	.L.str.15.76-.Lreltable.halide_type_to_string
+	.long	.L.str.16.75-.Lreltable.halide_type_to_string
+	.long	.L.str.17.74-.Lreltable.halide_type_to_string
+	.long	.L.str.18.73-.Lreltable.halide_type_to_string
+	.size	.Lreltable.halide_type_to_string, 20
+
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.ident	"clang version 16.0.1 (https://github.com/llvm/llvm-project.git cd89023f797900e4492da58b7bed36f702120011)"
+	.section	".note.GNU-stack","",@progbits
+</pre>
+</div>
+</div>
+</div>
+  </div>
+</body><script>
+    /* Highlighting 'matched' elements in IR code: hovering one highlights every element in the tab whose id shares its prefix (the text before the first '-') */
+    $('#ir-code-tab .matched').each(function () {
+        this.onmouseover = function () {
+            $('#ir-code-tab .matched[id^=' + this.id.split('-')[0] + '-]').addClass('Highlight');
+        }
+        this.onmouseout = function () {
+            $('#ir-code-tab .matched[id^=' + this.id.split('-')[0] + '-]').removeClass('Highlight');
+        }
+    });
+
+    /* Highlighting 'matched' elements in Viz code: same id-prefix matching, restricted to the visualization tab */
+    $('#ir-visualization-tab .matched').each(function () {
+        this.onmouseover = function () {
+            $('#ir-visualization-tab .matched[id^=' + this.id.split('-')[0] + '-]').addClass('Highlight');
+        }
+        this.onmouseout = function () {
+            $('#ir-visualization-tab .matched[id^=' + this.id.split('-')[0] + '-]').removeClass('Highlight');
+        }
+    });
+
+    /* Cross highlighting 'matched' variables (only): hovering a variable in the Viz tab highlights every '.matched.variable' element in the document with the same text */
+    $('#ir-visualization-tab .matched.variable').each(function () {
+        this.onmouseover = function () {
+            var name = this.outerText;
+            $('.matched.variable').filter((idx, val) => {
+                return val.outerText === name;
+            }).addClass('Highlight');
+        }
+        this.onmouseout = function () {
+            var name = this.outerText;
+            $('.matched.variable').filter((idx, val) => {
+                return val.outerText === name;
+            }).removeClass('Highlight');
+        }
+    });
+
+    /* Expand/Collapse buttons in IR code */
+    function toggle(id) {
+        e = document.getElementById(id);
+        e_cb = document.getElementsByClassName("cb-" + id)[0];
+        show = document.getElementById(id + '-show');
+        hide = document.getElementById(id + '-hide');
+        ccost_btn = document.getElementById("cc-" + id);
+        dcost_btn = document.getElementById("dc-" + id);
+        ccost_tt = document.getElementById("tooltip-cc-" + id);
+        dcost_tt = document.getElementById("tooltip-dc-" + id);
+        if (e.classList.contains("collapsed-block")) {
+            e.classList.remove("collapsed-block");
+            e_cb.classList.add("ClosingBrace");
+            show.style.display = 'none';
+            hide.style.display = 'block';
+            if (ccost_btn && dcost_tt) {
+                // Update cost indicators
+                ccost_color = ccost_btn.getAttribute('line-cost-color');
+                dcost_color = dcost_btn.getAttribute('line-cost-color');
+                ccost_btn.className = ccost_btn.className.replace(/CostColor\d+/, 'CostColor' + ccost_color);
+                dcost_btn.className = dcost_btn.className.replace(/CostColor\d+/, 'CostColor' + dcost_color);
+                // Update cost tooltips
+                ccost = ccost_btn.getAttribute('line-cost');
+                dcost = dcost_btn.getAttribute('line-cost');
+                ccost_tt.innerText = 'Op Count: ' + ccost;
+                dcost_tt.innerText = 'Bits Moved: ' + dcost;
+            }
+        } else {
+            e.classList.add("collapsed-block");
+            e_cb.classList.remove("ClosingBrace");
+            show.style.display = 'block';
+            hide.style.display = 'none';
+            if (ccost_btn && dcost_tt) {
+                // Update cost indicators
+                collapsed_ccost_color = ccost_btn.getAttribute('block-cost-color');
+                collapsed_dcost_color = dcost_btn.getAttribute('block-cost-color');
+                ccost_btn.className = ccost_btn.className.replace(/CostColor\d+/, 'CostColor' + collapsed_ccost_color);
+                dcost_btn.className = dcost_btn.className.replace(/CostColor\d+/, 'CostColor' + collapsed_dcost_color);
+                // Update cost tooltips
+                collapsed_ccost = ccost_btn.getAttribute('block-cost');
+                collapsed_dcost = dcost_btn.getAttribute('block-cost');
+                ccost_tt.innerText = 'Op Count: ' + collapsed_ccost;
+                dcost_tt.innerText = 'Bits Moved: ' + collapsed_dcost;
+            }
+        }
+        return false;
+    }
+
+    /* Expand/Collapse buttons in Viz */
+    function toggleViz(id) {
+        var buttonShow = document.getElementById(id + '-show');
+        var buttonHide = document.getElementById(id + '-hide');
+        var body = document.getElementById(id);
+        var re = /(?:\-([^-]+))?$/;
+        var ccost_btn = document.getElementById("vcc-" + re.exec(id)[1]);
+        var dcost_btn = document.getElementById("vdc-" + re.exec(id)[1]);
+        var ccost_tt = document.getElementById("tooltip-vcc-" + re.exec(id)[1]);
+        var dcost_tt = document.getElementById("tooltip-vdc-" + re.exec(id)[1]);
+        if (body.classList.contains("collapsed-viz")) {
+            body.classList.remove("collapsed-viz");
+            buttonShow.style.display = 'none';
+            buttonHide.style.display = 'block';
+            if (ccost_btn && dcost_tt) {
+                // Update cost indicators
+                ccost_color = ccost_btn.getAttribute('line-cost-color');
+                dcost_color = dcost_btn.getAttribute('line-cost-color');
+                ccost_btn.className = ccost_btn.className.replace(/CostColor\d+/, 'CostColor' + ccost_color);
+                dcost_btn.className = dcost_btn.className.replace(/CostColor\d+/, 'CostColor' + dcost_color);
+                // Update cost tooltips
+                ccost = ccost_btn.getAttribute('line-cost');
+                dcost = dcost_btn.getAttribute('line-cost');
+                ccost_tt.innerText = 'Op Count: ' + ccost;
+                dcost_tt.innerText = 'Bits Moved: ' + dcost;
+            }
+        } else {
+            body.classList.add("collapsed-viz");
+            buttonShow.style.display = 'block';
+            buttonHide.style.display = 'none';
+            if (ccost_btn && dcost_tt) {
+                // Update cost indicators
+                collapsed_ccost_color = ccost_btn.getAttribute('block-cost-color');
+                collapsed_dcost_color = dcost_btn.getAttribute('block-cost-color');
+                ccost_btn.className = ccost_btn.className.replace(/CostColor\d+/, 'CostColor' + collapsed_ccost_color);
+                dcost_btn.className = dcost_btn.className.replace(/CostColor\d+/, 'CostColor' + collapsed_dcost_color);
+                // Update cost tooltips
+                collapsed_ccost = ccost_btn.getAttribute('block-cost');
+                collapsed_dcost = dcost_btn.getAttribute('block-cost');
+                ccost_tt.innerText = 'Op Count: ' + collapsed_ccost;
+                dcost_tt.innerText = 'Bits Moved: ' + collapsed_dcost;
+            }
+        }
+    };
+
+    /* Scroll to visualization from IR code: smooth-scrolls the Viz tab to element `id` and flashes its background white for one second */
+    function scrollToViz(id) {
+        var container = document.getElementById('ir-visualization-tab');
+        var scrollToObject = document.getElementById(id);
+        makeVizVisible(scrollToObject); // expand the target and any collapsed ancestors first
+        container.scrollTo({
+            top: getOffsetTop(scrollToObject) - 8, // offset accumulated up to the Viz tab, minus an 8px margin
+            left: getOffsetLeft(scrollToObject),
+            behavior: 'smooth'
+        });
+        scrollToObject.style.backgroundColor = 'white';
+        setTimeout(function () {
+            scrollToObject.style.backgroundColor = 'transparent';
+        }, 1000);
+    }
+
+    function getOffsetTop(element) { // sums offsetTop along the offsetParent chain
+        if (!element) return 0; // end of the offsetParent chain
+        if (element.id == 'ir-visualization-tab') return 0; // stop at the Viz tab container
+        return getOffsetTop(element.offsetParent) + element.offsetTop;
+    }
+
+    function getOffsetLeft(element) { // sums offsetLeft along the offsetParent chain
+        if (!element) return 0; // end of the offsetParent chain
+        if (element.id == 'ir-visualization-tab') return 0; // stop at the Viz tab container
+        return getOffsetLeft(element.offsetParent) + element.offsetLeft;
+    }
+
+    // In case the viz block we are scrolling to sits within
+    // a collapsed parent block, uncollapse it (walks up through parentNode)
+    function makeVizVisible(element) {
+        if (!element) return;
+        if (element == document) return; // reached the top of the tree
+        if (element.classList.contains("collapsed-viz")) {
+            toggleViz(element.id); // the element is collapsed, so this call expands it
+        }
+        makeVizVisible(element.parentNode);
+    }
+
+    /* Scroll to code from visualization: smooth-scrolls the IR code tab to element `id` and flashes its background light gray for one second */
+    function scrollToCode(id) {
+        var container = document.getElementById('ir-code-tab');
+        var scrollToObject = document.getElementById(id);
+        makeCodeVisible(scrollToObject); // expand the target and any collapsed ancestors first
+        container.scrollTo({
+            top: scrollToObject.offsetTop,
+            behavior: 'smooth'
+        });
+        scrollToObject.style.backgroundColor = 'lightgray';
+        setTimeout(function () {
+            scrollToObject.style.backgroundColor = 'transparent';
+        }, 1000);
+    }
+
+    // In case the code we are scrolling to sits within
+    // a collapsed code block, uncollapse it (walks up through parentNode)
+    function makeCodeVisible(element) {
+        if (!element) return;
+        if (element == document) return; // reached the top of the tree
+        if (element.classList.contains("collapsed-block")) {
+            toggle(element.id); // the element is collapsed, so this call expands it
+        }
+        makeCodeVisible(element.parentNode);
+    }
+
+    /* Resizing visualization tabs */
+    var codeDiv = document.getElementById('ir-code-tab');
+    var resizeBar = document.getElementById('resize-bar-1');
+    var irVizDiv = document.getElementById('ir-visualization-tab');
+    var resizeBarAssembly = document.getElementById('resize-bar-2');
+    var assemblyCodeDiv = document.getElementById('assembly-tab');
+
+    codeDiv.style.flexGrow = '0';
+    resizeBar.style.flexGrow = '0';
+    irVizDiv.style.flexGrow = '0';
+    resizeBarAssembly.style.flexGrow = '0';
+    assemblyCodeDiv.style.flexGrow = '0';
+
+    codeDiv.style.flexBasis = 'calc(50% - 6px)';
+    resizeBar.style.flexBasis = '6px';
+    irVizDiv.style.flexBasis = 'calc(50% - 3px)';
+    resizeBarAssembly.style.flexBasis = '6px';
+
+    resizeBar.addEventListener('mousedown', (event) => {
+        document.addEventListener('mousemove', resize, false);
+        document.addEventListener('mouseup', () => {
+            document.removeEventListener('mousemove', resize, false);
+        }, false);
+    });
+
+    resizeBarAssembly.addEventListener('mousedown', (event) => {
+        document.addEventListener('mousemove', resizeAssembly, false);
+        document.addEventListener('mouseup', () => {
+            document.removeEventListener('mousemove', resizeAssembly, false);
+        }, false);
+    });
+
+    function resize(e) { // mousemove handler used while the first resize bar (code | viz) is dragged
+        if (e.x < 25) { // pointer within 25px of the left edge: hide the code tab
+            collapse_code_tab();
+            return;
+        }
+
+        const size = `${e.x}px`;
+        var rect = resizeBarAssembly.getBoundingClientRect();
+
+        if (e.x > rect.left) { // dragged past the second resize bar: hide the viz tab
+            collapseR_visualization_tab();
+            return;
+        }
+
+        codeDiv.style.display = 'block';
+        irVizDiv.style.display = 'block';
+        codeDiv.style.flexBasis = size;
+        irVizDiv.style.flexBasis = `calc(${rect.left}px - ${size})`; // viz spans from the pointer to the second bar
+    }
+
+    function resizeAssembly(e) {
+        if (e.x > screen.width - 25) {
+            collapse_assembly_tab();
+            return;
+        }
+
+        var rect = resizeBar.getBoundingClientRect();
+
+        if (e.x < rect.right) {
+            collapseL_visualization_tab();
+            return;
+        }
+
+        const size = `${e.x}px`;
+        irVizDiv.style.display = 'block';
+        assemblyCodeDiv.style.display = 'block';
+        irVizDiv.style.flexBasis = `calc(${size} - ${rect.right}px)`;
+        assemblyCodeDiv.style.flexBasis = `calc(100% - ${size})`;
+
+    }
+
+    function collapse_code_tab() { // hides the code tab; the viz tab takes the width up to the second resize bar
+        irVizDiv.style.display = 'block';
+        var rect = resizeBarAssembly.getBoundingClientRect();
+        irVizDiv.style.flexBasis = `${rect.left}px`;
+        codeDiv.style.display = 'none';
+    }
+
+    function collapseR_visualization_tab() { // hides the viz tab; the code tab takes the width up to the second resize bar
+        codeDiv.style.display = 'block';
+        var rect = resizeBarAssembly.getBoundingClientRect();
+        codeDiv.style.flexBasis = `${rect.left}px`;
+        irVizDiv.style.display = 'none';
+    }
+
+    function collapseL_visualization_tab() { // hides the viz tab; the assembly tab takes the width right of the first resize bar
+        assemblyCodeDiv.style.display = 'block';
+        var rect = resizeBar.getBoundingClientRect();
+        assemblyCodeDiv.style.flexBasis = `calc(100% - ${rect.right}px)`;
+        irVizDiv.style.display = 'none';
+    }
+
+    function collapse_assembly_tab() { // hides the assembly tab; the viz tab takes the width right of the first resize bar
+        irVizDiv.style.display = 'block';
+        var rect = resizeBar.getBoundingClientRect();
+        irVizDiv.style.flexBasis = `calc(100% - ${rect.right}px)`;
+        assemblyCodeDiv.style.display = 'none';
+    }
+
+    // Tooltips: position tooltipElement relative to buttonElement using the FloatingUIDOM global
+    function update(buttonElement, tooltipElement) {
+        window.FloatingUIDOM.computePosition(buttonElement, tooltipElement, {
+            placement: 'top',
+            middleware: [
+                window.FloatingUIDOM.offset(6),
+                window.FloatingUIDOM.flip(),
+                window.FloatingUIDOM.shift({
+                    padding: 5
+                }),
+            ],
+        }).then(({
+            x,
+            y,
+            placement,
+            middlewareData
+        }) => {
+            Object.assign(tooltipElement.style, {
+                left: `${x}px`,
+                top: `${y}px`,
+            });
+            // NOTE(review): staticSide (and middlewareData) are computed but never used below
+            const staticSide = {
+                top: 'bottom',
+                right: 'left',
+                bottom: 'top',
+                left: 'right',
+            }[placement.split('-')[0]];
+        });
+    }
+
+    function showTooltip(buttonElement, tooltipElement) { // makes the tooltip visible, then positions it
+        tooltipElement.style.display = 'block';
+        tooltipElement.style.opacity = '1';
+        update(buttonElement, tooltipElement);
+    }
+
+    function hideTooltip(tooltipElement) { // clears the inline display override and makes the tooltip transparent
+        tooltipElement.style.display = '';
+        tooltipElement.style.opacity = '0';
+    }
+
+    function init_tooltips(btns, prefix) {
+        var re = /(?:\-([^-]+))?$/;
+        for (var i = 0; i < btns.size(); i++) {
+            const button = btns[i];
+            const tooltip = $(prefix + re.exec(button.id)[1])[0];
+            button.is_clicked = false;
+            button.addEventListener('mouseenter', () => {
+                if (!button.is_clicked)
+                    showTooltip(button, tooltip);
+            });
+            button.addEventListener('mouseleave', () => {
+                if (!button.is_clicked)
+                    hideTooltip(tooltip);
+            });
+            button.addEventListener('click', () => {
+                if (button.is_clicked) {
+                    hideTooltip(tooltip);
+                    button.is_clicked = false;
+                } else {
+                    showTooltip(button, tooltip);
+                    button.is_clicked = true;
+                }
+            });
+        }
+    }
+
+    init_tooltips($('button[id^="cond-"]'), "span#cond-tooltip-"); // buttons whose id starts with "cond-"
+    init_tooltips($('div[id^="cc-"]'), "span#tooltip-cc-"); // "cc-" cost indicators in the IR code
+    init_tooltips($('div[id^="dc-"]'), "span#tooltip-dc-"); // "dc-" cost indicators in the IR code
+    init_tooltips($('div[id^="vcc-"]'), "span#tooltip-vcc-"); // "vcc-" cost indicators in the Viz
+    init_tooltips($('div[id^="vdc-"]'), "span#tooltip-vdc-"); // "vdc-" cost indicators in the Viz
+
+    function depth(elem) { // counts the div.box elements from elem (inclusive) up to div#ir-visualization-tab
+        if ($(elem).is("div#ir-visualization-tab"))
+            return 0;
+        else if ($(elem).is("div.box"))
+            return 1 + depth(elem.parentNode);
+        else
+            return depth(elem.parentNode); // NOTE(review): assumes elem is inside the Viz tab; no guard for a null parentNode
+    }
+
+    // Collapse viz boxes at depth > 1 (toggleViz collapses them here because they start expanded)
+    $('div[id^="viz-"]').filter((idx, val) => {
+        return depth(val) > 1;
+    }).each((idx, val) => {
+        toggleViz(val.id);
+    });
+
+    // CodeMirror Js
+
+    // Populate assembly code
+    assemblyCodeDiv.style.display = 'block';
+    var codeHTML = document.getElementById('assemblyContent');
+    var code = codeHTML.textContent;
+    code = code.trimLeft();
+    document.getElementById('assembly-tab').innerHTML = '';
+    var assemblyCM = CodeMirror(document.getElementById('assembly-tab'), {
+        value: code,
+        lineNumbers: true,
+        lineWrapping: true,
+        mode: {
+            name: 'gas',
+            architecture: 'ARMv6'
+        },
+        readOnly: true,
+    });
+
+    function scrollToAsm(lno) { // scrolls the assembly viewer to line lno, focuses it and moves the cursor there
+        assemblyCM.scrollIntoView({
+            line: assemblyCM.lineCount() - 1, // jump to the last line first
+            ch: 0
+        });
+        assemblyCM.scrollIntoView({
+            line: lno - 1,
+            ch: 0
+        });
+        assemblyCM.focus();
+        assemblyCM.setCursor({
+            line: lno, // NOTE(review): the scroll above targets lno - 1 but the cursor goes to lno; confirm which is intended
+            ch: 0
+        });
+    }
+
+    scrollToAsm(1); // start at the top of the listing
+    collapse_assembly_tab(); // the assembly tab starts hidden
+
+    // Cost model js
+    var re = /(?:\-([^-]+))?$/;
+    var cost_btns = $('div[id^="cc-"], div[id^="dc-"]');
+    for (var i = 0; i < cost_btns.size(); i++) {
+        const button = cost_btns[i];
+        const highlight_span = $("span#cost-bg-" + re.exec(button.id)[1])[0];
+        $(button).mouseover(() => {
+            $(highlight_span).css("background", "#e5e3e3");
+        });
+        $(button).mouseout(() => {
+            $(highlight_span).css("background", "none");
+        });
+    }
+</script></html>
\ No newline at end of file
diff --git a/doc/book/random/vector_intrinsics.Rmd b/doc/book/random/vector_intrinsics.Rmd
index 78f5bf4ab..2f5bed0a0 100644
--- a/doc/book/random/vector_intrinsics.Rmd
+++ b/doc/book/random/vector_intrinsics.Rmd
@@ -453,6 +453,16 @@ After a bit of inspection, by reinstating progressively all the different
 parallelisms, we start to understand how the parallel patterns are implemented
 on assembly code.
 
+This is what it looks like.
+<iframe src="./random/separable_conv_2d.stmt.html" height="405" width="720"
+style="border: 1px solid #464646;" allowfullscreen>
+</iframe>
+
+It is not practical to view the embedded HTML page, so follow this <a
+href="./random/separable_conv_2d.stmt.html">link</a> to fully explore the
+compiled statement as a full page.
+
+
 ### Vectorizing in the Convolution
 
 Eventually, this was not so hard and this gives a very tangible hands-on

From c17f01c78f02bd130a50100a28636ceaa13e6ab4 Mon Sep 17 00:00:00 2001
From: Odd Kiva <2375733-oddkiva@users.noreply.gitlab.com>
Date: Fri, 12 Jan 2024 20:53:08 +0000
Subject: [PATCH 02/49] DOC: fix typo.

---
 doc/book/random/bilinear_interpolation.Rmd     | 12 ++++++------
 doc/book/random/python_code_that_suck_less.Rmd | 16 ++++++++--------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/doc/book/random/bilinear_interpolation.Rmd b/doc/book/random/bilinear_interpolation.Rmd
index e0d9299e4..57b4d4fb9 100644
--- a/doc/book/random/bilinear_interpolation.Rmd
+++ b/doc/book/random/bilinear_interpolation.Rmd
@@ -10,12 +10,12 @@ This is a technical exercise I do find interesting because it helps me to know
 whether someone can fix the technical insides of an existing pipeline. It's not
 simple but it's not that hard either.
 
-Eventually I lowered my expectations by just asking to complete a simple machine
-learning training pipeline. And everybody just jumped their guns and were happy
-to impress you about how much they know about the bleeding edge neural
-architecture and what not.^[The world never changes... Just shows how people
-most of the time likes to mansplain by displaying their knowledge more than
-their ability to think. So much ego all in all...]
+Eventually I lowered my expectations by just asking applicants to complete a
+simple machine learning training pipeline. And everybody just jumped the gun
+and was happy to impress you with how much they know about bleeding-edge
+neural architectures and whatnot.^[The world never changes... It just shows how
+people most of the time like to mansplain by displaying their knowledge more
+than their ability to think. So much ego all in all...]
 
 Let's see how we can do this in Python in a NumPy-like style. I leave it as an
 exercise in C++.
diff --git a/doc/book/random/python_code_that_suck_less.Rmd b/doc/book/random/python_code_that_suck_less.Rmd
index 4d81825cf..976e3b7a3 100644
--- a/doc/book/random/python_code_that_suck_less.Rmd
+++ b/doc/book/random/python_code_that_suck_less.Rmd
@@ -108,8 +108,8 @@ Non-Maximum Suppression (NMS) algorithm.
 
 ### The algorithm described
 
-The NMS aims at filtering the best object boxes. The best object boxes must have
-the highest objectness score and each one of them can't overlap too much with
+The NMS aims at keeping the best object boxes. The best object boxes are those
+with the highest objectness scores and that overlap very little with
 each other.
 
 The objectness score is the probability that the object box contains an object
@@ -120,13 +120,13 @@ according to this definition.]
   objectness score.
 
 - Then we process greedily by
-  1. first keeping the first object box of the list, since it has the highest object
-     box.
+  1. first keeping the first object box of the list, since it has the highest
+     objectness score.
 
   2. Next we examine the second best object box and we keep if it does not
-     overlap with the first box or if it overlaps, it should overlap very
-     little. A good metric is the ratio of the **intersection** area between the
-     first box and second box *over the union* area, the so-called IoU score.
+     overlap with the first box or if it overlaps, it should do so very little.
+     A good metric is the ratio of the **intersection** area between the first
+     box and second box **over the union** area, the so-called IoU score.
 
      Let's imagine the second box does not overlap with the first one.
 
@@ -327,7 +327,7 @@ This is aesthetically better and more meaningful than
 ```{cpp}
 const auto& xijk = ...; // too crammed
 
-// The following is fine too as typing the underscore symbol can be a hassle.
+// The following is fine as typing the underscore symbol can be a hassle.
 // I am not a snake_case zealot either...
 const auto xi = x[i];
 ```

From d4d54cd13d89d527643ebb3de1469ff946a776bf Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Sat, 20 Jan 2024 01:52:11 +0000
Subject: [PATCH 03/49] Bump vite from 4.3.9 to 4.5.2 in /svelte/sara-app

Bumps [vite](https://github.com/vitejs/vite/tree/HEAD/packages/vite) from 4.3.9 to 4.5.2.
- [Release notes](https://github.com/vitejs/vite/releases)
- [Changelog](https://github.com/vitejs/vite/blob/v4.5.2/packages/vite/CHANGELOG.md)
- [Commits](https://github.com/vitejs/vite/commits/v4.5.2/packages/vite)

---
updated-dependencies:
- dependency-name: vite
  dependency-type: direct:development
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 svelte/sara-app/package.json   |   2 +-
 svelte/sara-app/pnpm-lock.yaml | 183 +++++++++++++++++----------------
 2 files changed, 94 insertions(+), 91 deletions(-)

diff --git a/svelte/sara-app/package.json b/svelte/sara-app/package.json
index a0cdf2925..4c2fdbaf8 100644
--- a/svelte/sara-app/package.json
+++ b/svelte/sara-app/package.json
@@ -20,7 +20,7 @@
 		"svelte-check": "^3.0.1",
 		"tslib": "^2.4.1",
 		"typescript": "^5.0.0",
-		"vite": "^4.3.0",
+		"vite": "^4.5.2",
     "autoprefixer": "^10.4.14",
     "daisyui": "^3.0.2",
     "postcss": "^8.4.31",
diff --git a/svelte/sara-app/pnpm-lock.yaml b/svelte/sara-app/pnpm-lock.yaml
index d2b1e2850..5885f4d62 100644
--- a/svelte/sara-app/pnpm-lock.yaml
+++ b/svelte/sara-app/pnpm-lock.yaml
@@ -10,7 +10,7 @@ devDependencies:
     version: 2.1.0(@sveltejs/kit@1.20.2)
   '@sveltejs/kit':
     specifier: ^1.5.0
-    version: 1.20.2(svelte@3.59.1)(vite@4.3.9)
+    version: 1.20.2(svelte@3.59.1)(vite@4.5.2)
   autoprefixer:
     specifier: ^10.4.14
     version: 10.4.14(postcss@8.4.31)
@@ -42,8 +42,8 @@ devDependencies:
     specifier: ^5.0.0
     version: 5.1.3
   vite:
-    specifier: ^4.3.0
-    version: 4.3.9
+    specifier: ^4.5.2
+    version: 4.5.2
 
 packages:
 
@@ -52,8 +52,8 @@ packages:
     engines: {node: '>=10'}
     dev: true
 
-  /@esbuild/android-arm64@0.17.19:
-    resolution: {integrity: sha512-KBMWvEZooR7+kzY0BtbTQn0OAYY7CsiydT63pVEaPtVYF0hXbUaOyZog37DKxK7NF3XacBJOpYT4adIJh+avxA==}
+  /@esbuild/android-arm64@0.18.20:
+    resolution: {integrity: sha512-Nz4rJcchGDtENV0eMKUNa6L12zz2zBDXuhj/Vjh18zGqB44Bi7MBMSXjgunJgjRhCmKOjnPuZp4Mb6OKqtMHLQ==}
     engines: {node: '>=12'}
     cpu: [arm64]
     os: [android]
@@ -61,8 +61,8 @@ packages:
     dev: true
     optional: true
 
-  /@esbuild/android-arm@0.17.19:
-    resolution: {integrity: sha512-rIKddzqhmav7MSmoFCmDIb6e2W57geRsM94gV2l38fzhXMwq7hZoClug9USI2pFRGL06f4IOPHHpFNOkWieR8A==}
+  /@esbuild/android-arm@0.18.20:
+    resolution: {integrity: sha512-fyi7TDI/ijKKNZTUJAQqiG5T7YjJXgnzkURqmGj13C6dCqckZBLdl4h7bkhHt/t0WP+zO9/zwroDvANaOqO5Sw==}
     engines: {node: '>=12'}
     cpu: [arm]
     os: [android]
@@ -70,8 +70,8 @@ packages:
     dev: true
     optional: true
 
-  /@esbuild/android-x64@0.17.19:
-    resolution: {integrity: sha512-uUTTc4xGNDT7YSArp/zbtmbhO0uEEK9/ETW29Wk1thYUJBz3IVnvgEiEwEa9IeLyvnpKrWK64Utw2bgUmDveww==}
+  /@esbuild/android-x64@0.18.20:
+    resolution: {integrity: sha512-8GDdlePJA8D6zlZYJV/jnrRAi6rOiNaCC/JclcXpB+KIuvfBN4owLtgzY2bsxnx666XjJx2kDPUmnTtR8qKQUg==}
     engines: {node: '>=12'}
     cpu: [x64]
     os: [android]
@@ -79,8 +79,8 @@ packages:
     dev: true
     optional: true
 
-  /@esbuild/darwin-arm64@0.17.19:
-    resolution: {integrity: sha512-80wEoCfF/hFKM6WE1FyBHc9SfUblloAWx6FJkFWTWiCoht9Mc0ARGEM47e67W9rI09YoUxJL68WHfDRYEAvOhg==}
+  /@esbuild/darwin-arm64@0.18.20:
+    resolution: {integrity: sha512-bxRHW5kHU38zS2lPTPOyuyTm+S+eobPUnTNkdJEfAddYgEcll4xkT8DB9d2008DtTbl7uJag2HuE5NZAZgnNEA==}
     engines: {node: '>=12'}
     cpu: [arm64]
     os: [darwin]
@@ -88,8 +88,8 @@ packages:
     dev: true
     optional: true
 
-  /@esbuild/darwin-x64@0.17.19:
-    resolution: {integrity: sha512-IJM4JJsLhRYr9xdtLytPLSH9k/oxR3boaUIYiHkAawtwNOXKE8KoU8tMvryogdcT8AU+Bflmh81Xn6Q0vTZbQw==}
+  /@esbuild/darwin-x64@0.18.20:
+    resolution: {integrity: sha512-pc5gxlMDxzm513qPGbCbDukOdsGtKhfxD1zJKXjCCcU7ju50O7MeAZ8c4krSJcOIJGFR+qx21yMMVYwiQvyTyQ==}
     engines: {node: '>=12'}
     cpu: [x64]
     os: [darwin]
@@ -97,8 +97,8 @@ packages:
     dev: true
     optional: true
 
-  /@esbuild/freebsd-arm64@0.17.19:
-    resolution: {integrity: sha512-pBwbc7DufluUeGdjSU5Si+P3SoMF5DQ/F/UmTSb8HXO80ZEAJmrykPyzo1IfNbAoaqw48YRpv8shwd1NoI0jcQ==}
+  /@esbuild/freebsd-arm64@0.18.20:
+    resolution: {integrity: sha512-yqDQHy4QHevpMAaxhhIwYPMv1NECwOvIpGCZkECn8w2WFHXjEwrBn3CeNIYsibZ/iZEUemj++M26W3cNR5h+Tw==}
     engines: {node: '>=12'}
     cpu: [arm64]
     os: [freebsd]
@@ -106,8 +106,8 @@ packages:
     dev: true
     optional: true
 
-  /@esbuild/freebsd-x64@0.17.19:
-    resolution: {integrity: sha512-4lu+n8Wk0XlajEhbEffdy2xy53dpR06SlzvhGByyg36qJw6Kpfk7cp45DR/62aPH9mtJRmIyrXAS5UWBrJT6TQ==}
+  /@esbuild/freebsd-x64@0.18.20:
+    resolution: {integrity: sha512-tgWRPPuQsd3RmBZwarGVHZQvtzfEBOreNuxEMKFcd5DaDn2PbBxfwLcj4+aenoh7ctXcbXmOQIn8HI6mCSw5MQ==}
     engines: {node: '>=12'}
     cpu: [x64]
     os: [freebsd]
@@ -115,8 +115,8 @@ packages:
     dev: true
     optional: true
 
-  /@esbuild/linux-arm64@0.17.19:
-    resolution: {integrity: sha512-ct1Tg3WGwd3P+oZYqic+YZF4snNl2bsnMKRkb3ozHmnM0dGWuxcPTTntAF6bOP0Sp4x0PjSF+4uHQ1xvxfRKqg==}
+  /@esbuild/linux-arm64@0.18.20:
+    resolution: {integrity: sha512-2YbscF+UL7SQAVIpnWvYwM+3LskyDmPhe31pE7/aoTMFKKzIc9lLbyGUpmmb8a8AixOL61sQ/mFh3jEjHYFvdA==}
     engines: {node: '>=12'}
     cpu: [arm64]
     os: [linux]
@@ -124,8 +124,8 @@ packages:
     dev: true
     optional: true
 
-  /@esbuild/linux-arm@0.17.19:
-    resolution: {integrity: sha512-cdmT3KxjlOQ/gZ2cjfrQOtmhG4HJs6hhvm3mWSRDPtZ/lP5oe8FWceS10JaSJC13GBd4eH/haHnqf7hhGNLerA==}
+  /@esbuild/linux-arm@0.18.20:
+    resolution: {integrity: sha512-/5bHkMWnq1EgKr1V+Ybz3s1hWXok7mDFUMQ4cG10AfW3wL02PSZi5kFpYKrptDsgb2WAJIvRcDm+qIvXf/apvg==}
     engines: {node: '>=12'}
     cpu: [arm]
     os: [linux]
@@ -133,8 +133,8 @@ packages:
     dev: true
     optional: true
 
-  /@esbuild/linux-ia32@0.17.19:
-    resolution: {integrity: sha512-w4IRhSy1VbsNxHRQpeGCHEmibqdTUx61Vc38APcsRbuVgK0OPEnQ0YD39Brymn96mOx48Y2laBQGqgZ0j9w6SQ==}
+  /@esbuild/linux-ia32@0.18.20:
+    resolution: {integrity: sha512-P4etWwq6IsReT0E1KHU40bOnzMHoH73aXp96Fs8TIT6z9Hu8G6+0SHSw9i2isWrD2nbx2qo5yUqACgdfVGx7TA==}
     engines: {node: '>=12'}
     cpu: [ia32]
     os: [linux]
@@ -142,8 +142,8 @@ packages:
     dev: true
     optional: true
 
-  /@esbuild/linux-loong64@0.17.19:
-    resolution: {integrity: sha512-2iAngUbBPMq439a+z//gE+9WBldoMp1s5GWsUSgqHLzLJ9WoZLZhpwWuym0u0u/4XmZ3gpHmzV84PonE+9IIdQ==}
+  /@esbuild/linux-loong64@0.18.20:
+    resolution: {integrity: sha512-nXW8nqBTrOpDLPgPY9uV+/1DjxoQ7DoB2N8eocyq8I9XuqJ7BiAMDMf9n1xZM9TgW0J8zrquIb/A7s3BJv7rjg==}
     engines: {node: '>=12'}
     cpu: [loong64]
     os: [linux]
@@ -151,8 +151,8 @@ packages:
     dev: true
     optional: true
 
-  /@esbuild/linux-mips64el@0.17.19:
-    resolution: {integrity: sha512-LKJltc4LVdMKHsrFe4MGNPp0hqDFA1Wpt3jE1gEyM3nKUvOiO//9PheZZHfYRfYl6AwdTH4aTcXSqBerX0ml4A==}
+  /@esbuild/linux-mips64el@0.18.20:
+    resolution: {integrity: sha512-d5NeaXZcHp8PzYy5VnXV3VSd2D328Zb+9dEq5HE6bw6+N86JVPExrA6O68OPwobntbNJ0pzCpUFZTo3w0GyetQ==}
     engines: {node: '>=12'}
     cpu: [mips64el]
     os: [linux]
@@ -160,8 +160,8 @@ packages:
     dev: true
     optional: true
 
-  /@esbuild/linux-ppc64@0.17.19:
-    resolution: {integrity: sha512-/c/DGybs95WXNS8y3Ti/ytqETiW7EU44MEKuCAcpPto3YjQbyK3IQVKfF6nbghD7EcLUGl0NbiL5Rt5DMhn5tg==}
+  /@esbuild/linux-ppc64@0.18.20:
+    resolution: {integrity: sha512-WHPyeScRNcmANnLQkq6AfyXRFr5D6N2sKgkFo2FqguP44Nw2eyDlbTdZwd9GYk98DZG9QItIiTlFLHJHjxP3FA==}
     engines: {node: '>=12'}
     cpu: [ppc64]
     os: [linux]
@@ -169,8 +169,8 @@ packages:
     dev: true
     optional: true
 
-  /@esbuild/linux-riscv64@0.17.19:
-    resolution: {integrity: sha512-FC3nUAWhvFoutlhAkgHf8f5HwFWUL6bYdvLc/TTuxKlvLi3+pPzdZiFKSWz/PF30TB1K19SuCxDTI5KcqASJqA==}
+  /@esbuild/linux-riscv64@0.18.20:
+    resolution: {integrity: sha512-WSxo6h5ecI5XH34KC7w5veNnKkju3zBRLEQNY7mv5mtBmrP/MjNBCAlsM2u5hDBlS3NGcTQpoBvRzqBcRtpq1A==}
     engines: {node: '>=12'}
     cpu: [riscv64]
     os: [linux]
@@ -178,8 +178,8 @@ packages:
     dev: true
     optional: true
 
-  /@esbuild/linux-s390x@0.17.19:
-    resolution: {integrity: sha512-IbFsFbxMWLuKEbH+7sTkKzL6NJmG2vRyy6K7JJo55w+8xDk7RElYn6xvXtDW8HCfoKBFK69f3pgBJSUSQPr+4Q==}
+  /@esbuild/linux-s390x@0.18.20:
+    resolution: {integrity: sha512-+8231GMs3mAEth6Ja1iK0a1sQ3ohfcpzpRLH8uuc5/KVDFneH6jtAJLFGafpzpMRO6DzJ6AvXKze9LfFMrIHVQ==}
     engines: {node: '>=12'}
     cpu: [s390x]
     os: [linux]
@@ -187,8 +187,8 @@ packages:
     dev: true
     optional: true
 
-  /@esbuild/linux-x64@0.17.19:
-    resolution: {integrity: sha512-68ngA9lg2H6zkZcyp22tsVt38mlhWde8l3eJLWkyLrp4HwMUr3c1s/M2t7+kHIhvMjglIBrFpncX1SzMckomGw==}
+  /@esbuild/linux-x64@0.18.20:
+    resolution: {integrity: sha512-UYqiqemphJcNsFEskc73jQ7B9jgwjWrSayxawS6UVFZGWrAAtkzjxSqnoclCXxWtfwLdzU+vTpcNYhpn43uP1w==}
     engines: {node: '>=12'}
     cpu: [x64]
     os: [linux]
@@ -196,8 +196,8 @@ packages:
     dev: true
     optional: true
 
-  /@esbuild/netbsd-x64@0.17.19:
-    resolution: {integrity: sha512-CwFq42rXCR8TYIjIfpXCbRX0rp1jo6cPIUPSaWwzbVI4aOfX96OXY8M6KNmtPcg7QjYeDmN+DD0Wp3LaBOLf4Q==}
+  /@esbuild/netbsd-x64@0.18.20:
+    resolution: {integrity: sha512-iO1c++VP6xUBUmltHZoMtCUdPlnPGdBom6IrO4gyKPFFVBKioIImVooR5I83nTew5UOYrk3gIJhbZh8X44y06A==}
     engines: {node: '>=12'}
     cpu: [x64]
     os: [netbsd]
@@ -205,8 +205,8 @@ packages:
     dev: true
     optional: true
 
-  /@esbuild/openbsd-x64@0.17.19:
-    resolution: {integrity: sha512-cnq5brJYrSZ2CF6c35eCmviIN3k3RczmHz8eYaVlNasVqsNY+JKohZU5MKmaOI+KkllCdzOKKdPs762VCPC20g==}
+  /@esbuild/openbsd-x64@0.18.20:
+    resolution: {integrity: sha512-e5e4YSsuQfX4cxcygw/UCPIEP6wbIL+se3sxPdCiMbFLBWu0eiZOJ7WoD+ptCLrmjZBK1Wk7I6D/I3NglUGOxg==}
     engines: {node: '>=12'}
     cpu: [x64]
     os: [openbsd]
@@ -214,8 +214,8 @@ packages:
     dev: true
     optional: true
 
-  /@esbuild/sunos-x64@0.17.19:
-    resolution: {integrity: sha512-vCRT7yP3zX+bKWFeP/zdS6SqdWB8OIpaRq/mbXQxTGHnIxspRtigpkUcDMlSCOejlHowLqII7K2JKevwyRP2rg==}
+  /@esbuild/sunos-x64@0.18.20:
+    resolution: {integrity: sha512-kDbFRFp0YpTQVVrqUd5FTYmWo45zGaXe0X8E1G/LKFC0v8x0vWrhOWSLITcCn63lmZIxfOMXtCfti/RxN/0wnQ==}
     engines: {node: '>=12'}
     cpu: [x64]
     os: [sunos]
@@ -223,8 +223,8 @@ packages:
     dev: true
     optional: true
 
-  /@esbuild/win32-arm64@0.17.19:
-    resolution: {integrity: sha512-yYx+8jwowUstVdorcMdNlzklLYhPxjniHWFKgRqH7IFlUEa0Umu3KuYplf1HUZZ422e3NU9F4LGb+4O0Kdcaag==}
+  /@esbuild/win32-arm64@0.18.20:
+    resolution: {integrity: sha512-ddYFR6ItYgoaq4v4JmQQaAI5s7npztfV4Ag6NrhiaW0RrnOXqBkgwZLofVTlq1daVTQNhtI5oieTvkRPfZrePg==}
     engines: {node: '>=12'}
     cpu: [arm64]
     os: [win32]
@@ -232,8 +232,8 @@ packages:
     dev: true
     optional: true
 
-  /@esbuild/win32-ia32@0.17.19:
-    resolution: {integrity: sha512-eggDKanJszUtCdlVs0RB+h35wNlb5v4TWEkq4vZcmVt5u/HiDZrTXe2bWFQUez3RgNHwx/x4sk5++4NSSicKkw==}
+  /@esbuild/win32-ia32@0.18.20:
+    resolution: {integrity: sha512-Wv7QBi3ID/rROT08SABTS7eV4hX26sVduqDOTe1MvGMjNd3EjOz4b7zeexIR62GTIEKrfJXKL9LFxTYgkyeu7g==}
     engines: {node: '>=12'}
     cpu: [ia32]
     os: [win32]
@@ -241,8 +241,8 @@ packages:
     dev: true
     optional: true
 
-  /@esbuild/win32-x64@0.17.19:
-    resolution: {integrity: sha512-lAhycmKnVOuRYNtRtatQR1LPQf2oYCkRGkSFnseDAKPl8lu5SOsK/e1sXe5a0Pc5kHIHe6P2I/ilntNv2xf3cA==}
+  /@esbuild/win32-x64@0.18.20:
+    resolution: {integrity: sha512-kTdfRcSiDfQca/y9QIkng02avJ+NCaQvrMejlsB3RRv5sE9rRoeBPISaZpKxHELzRxZyLvNts1P27W3wV+8geQ==}
     engines: {node: '>=12'}
     cpu: [x64]
     os: [win32]
@@ -314,11 +314,11 @@ packages:
     peerDependencies:
       '@sveltejs/kit': ^1.0.0
     dependencies:
-      '@sveltejs/kit': 1.20.2(svelte@3.59.1)(vite@4.3.9)
+      '@sveltejs/kit': 1.20.2(svelte@3.59.1)(vite@4.5.2)
       import-meta-resolve: 3.0.0
     dev: true
 
-  /@sveltejs/kit@1.20.2(svelte@3.59.1)(vite@4.3.9):
+  /@sveltejs/kit@1.20.2(svelte@3.59.1)(vite@4.5.2):
     resolution: {integrity: sha512-MtR1i+HtmYWcRgtubw1GQqT/+CWXL/z24PegE0xYAdObbhdr7YtEfmoe705D/JZMtMmoPXrmSk4W0MfL5A3lYw==}
     engines: {node: ^16.14 || >=18}
     hasBin: true
@@ -327,7 +327,7 @@ packages:
       svelte: ^3.54.0 || ^4.0.0-next.0
       vite: ^4.0.0
     dependencies:
-      '@sveltejs/vite-plugin-svelte': 2.4.1(svelte@3.59.1)(vite@4.3.9)
+      '@sveltejs/vite-plugin-svelte': 2.4.1(svelte@3.59.1)(vite@4.5.2)
       '@types/cookie': 0.5.1
       cookie: 0.5.0
       devalue: 4.3.2
@@ -341,12 +341,12 @@ packages:
       svelte: 3.59.1
       tiny-glob: 0.2.9
       undici: 5.22.1
-      vite: 4.3.9
+      vite: 4.5.2
     transitivePeerDependencies:
       - supports-color
     dev: true
 
-  /@sveltejs/vite-plugin-svelte-inspector@1.0.2(@sveltejs/vite-plugin-svelte@2.4.1)(svelte@3.59.1)(vite@4.3.9):
+  /@sveltejs/vite-plugin-svelte-inspector@1.0.2(@sveltejs/vite-plugin-svelte@2.4.1)(svelte@3.59.1)(vite@4.5.2):
     resolution: {integrity: sha512-Cy1dUMcYCnDVV/hPLXa43YZJ2jGKVW5rA0xuNL9dlmYhT0yoS1g7+FOFSRlgk0BXKk/Oc7grs+8BVA5Iz2fr8A==}
     engines: {node: ^14.18.0 || >= 16}
     peerDependencies:
@@ -354,30 +354,30 @@ packages:
       svelte: ^3.54.0 || ^4.0.0-next.0
       vite: ^4.0.0
     dependencies:
-      '@sveltejs/vite-plugin-svelte': 2.4.1(svelte@3.59.1)(vite@4.3.9)
+      '@sveltejs/vite-plugin-svelte': 2.4.1(svelte@3.59.1)(vite@4.5.2)
       debug: 4.3.4
       svelte: 3.59.1
-      vite: 4.3.9
+      vite: 4.5.2
     transitivePeerDependencies:
       - supports-color
     dev: true
 
-  /@sveltejs/vite-plugin-svelte@2.4.1(svelte@3.59.1)(vite@4.3.9):
+  /@sveltejs/vite-plugin-svelte@2.4.1(svelte@3.59.1)(vite@4.5.2):
     resolution: {integrity: sha512-bNNKvoRY89ptY7udeBSCmTdCVwkjmMcZ0j/z9J5MuedT8jPjq0zrknAo/jF1sToAza4NVaAgR9AkZoD9oJJmnA==}
     engines: {node: ^14.18.0 || >= 16}
     peerDependencies:
       svelte: ^3.54.0 || ^4.0.0-next.0
       vite: ^4.0.0
     dependencies:
-      '@sveltejs/vite-plugin-svelte-inspector': 1.0.2(@sveltejs/vite-plugin-svelte@2.4.1)(svelte@3.59.1)(vite@4.3.9)
+      '@sveltejs/vite-plugin-svelte-inspector': 1.0.2(@sveltejs/vite-plugin-svelte@2.4.1)(svelte@3.59.1)(vite@4.5.2)
       debug: 4.3.4
       deepmerge: 4.3.1
       kleur: 4.1.5
       magic-string: 0.30.0
       svelte: 3.59.1
       svelte-hmr: 0.15.2(svelte@3.59.1)
-      vite: 4.3.9
-      vitefu: 0.2.4(vite@4.3.9)
+      vite: 4.5.2
+      vitefu: 0.2.4(vite@4.5.2)
     transitivePeerDependencies:
       - supports-color
     dev: true
@@ -584,34 +584,34 @@ packages:
     resolution: {integrity: sha512-SOp9Phqvqn7jtEUxPWdWfWoLmyt2VaJ6MpvP9Comy1MceMXqE6bxvaTu4iaxpYYPzhny28Lc+M87/c2cPK6lDg==}
     dev: true
 
-  /esbuild@0.17.19:
-    resolution: {integrity: sha512-XQ0jAPFkK/u3LcVRcvVHQcTIqD6E2H1fvZMA5dQPSOWb3suUbWbfbRf94pjc0bNzRYLfIrDRQXr7X+LHIm5oHw==}
+  /esbuild@0.18.20:
+    resolution: {integrity: sha512-ceqxoedUrcayh7Y7ZX6NdbbDzGROiyVBgC4PriJThBKSVPWnnFHZAkfI1lJT8QFkOwH4qOS2SJkS4wvpGl8BpA==}
     engines: {node: '>=12'}
     hasBin: true
     requiresBuild: true
     optionalDependencies:
-      '@esbuild/android-arm': 0.17.19
-      '@esbuild/android-arm64': 0.17.19
-      '@esbuild/android-x64': 0.17.19
-      '@esbuild/darwin-arm64': 0.17.19
-      '@esbuild/darwin-x64': 0.17.19
-      '@esbuild/freebsd-arm64': 0.17.19
-      '@esbuild/freebsd-x64': 0.17.19
-      '@esbuild/linux-arm': 0.17.19
-      '@esbuild/linux-arm64': 0.17.19
-      '@esbuild/linux-ia32': 0.17.19
-      '@esbuild/linux-loong64': 0.17.19
-      '@esbuild/linux-mips64el': 0.17.19
-      '@esbuild/linux-ppc64': 0.17.19
-      '@esbuild/linux-riscv64': 0.17.19
-      '@esbuild/linux-s390x': 0.17.19
-      '@esbuild/linux-x64': 0.17.19
-      '@esbuild/netbsd-x64': 0.17.19
-      '@esbuild/openbsd-x64': 0.17.19
-      '@esbuild/sunos-x64': 0.17.19
-      '@esbuild/win32-arm64': 0.17.19
-      '@esbuild/win32-ia32': 0.17.19
-      '@esbuild/win32-x64': 0.17.19
+      '@esbuild/android-arm': 0.18.20
+      '@esbuild/android-arm64': 0.18.20
+      '@esbuild/android-x64': 0.18.20
+      '@esbuild/darwin-arm64': 0.18.20
+      '@esbuild/darwin-x64': 0.18.20
+      '@esbuild/freebsd-arm64': 0.18.20
+      '@esbuild/freebsd-x64': 0.18.20
+      '@esbuild/linux-arm': 0.18.20
+      '@esbuild/linux-arm64': 0.18.20
+      '@esbuild/linux-ia32': 0.18.20
+      '@esbuild/linux-loong64': 0.18.20
+      '@esbuild/linux-mips64el': 0.18.20
+      '@esbuild/linux-ppc64': 0.18.20
+      '@esbuild/linux-riscv64': 0.18.20
+      '@esbuild/linux-s390x': 0.18.20
+      '@esbuild/linux-x64': 0.18.20
+      '@esbuild/netbsd-x64': 0.18.20
+      '@esbuild/openbsd-x64': 0.18.20
+      '@esbuild/sunos-x64': 0.18.20
+      '@esbuild/win32-arm64': 0.18.20
+      '@esbuild/win32-ia32': 0.18.20
+      '@esbuild/win32-x64': 0.18.20
     dev: true
 
   /escalade@3.1.1:
@@ -1075,8 +1075,8 @@ packages:
       glob: 7.2.3
     dev: true
 
-  /rollup@3.25.1:
-    resolution: {integrity: sha512-tywOR+rwIt5m2ZAWSe5AIJcTat8vGlnPFAv15ycCrw33t6iFsXZ6mzHVFh2psSjxQPmI+xgzMZZizUAukBI4aQ==}
+  /rollup@3.29.4:
+    resolution: {integrity: sha512-oWzmBZwvYrU0iJHtDmhsm662rC15FRXmcjCk1xD771dFDx5jJ02ufAQQTn0etB2emNk4J9EZg/yWKpsn9BWGRw==}
     engines: {node: '>=14.18.0', npm: '>=8.0.0'}
     hasBin: true
     optionalDependencies:
@@ -1353,13 +1353,14 @@ packages:
     resolution: {integrity: sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==}
     dev: true
 
-  /vite@4.3.9:
-    resolution: {integrity: sha512-qsTNZjO9NoJNW7KnOrgYwczm0WctJ8m/yqYAMAK9Lxt4SoySUfS5S8ia9K7JHpa3KEeMfyF8LoJ3c5NeBJy6pg==}
+  /vite@4.5.2:
+    resolution: {integrity: sha512-tBCZBNSBbHQkaGyhGCDUGqeo2ph8Fstyp6FMSvTtsXeZSPpSMGlviAOav2hxVTqFcx8Hj/twtWKsMJXNY0xI8w==}
     engines: {node: ^14.18.0 || >=16.0.0}
     hasBin: true
     peerDependencies:
       '@types/node': '>= 14'
       less: '*'
+      lightningcss: ^1.21.0
       sass: '*'
       stylus: '*'
       sugarss: '*'
@@ -1369,6 +1370,8 @@ packages:
         optional: true
       less:
         optional: true
+      lightningcss:
+        optional: true
       sass:
         optional: true
       stylus:
@@ -1378,14 +1381,14 @@ packages:
       terser:
         optional: true
     dependencies:
-      esbuild: 0.17.19
+      esbuild: 0.18.20
       postcss: 8.4.31
-      rollup: 3.25.1
+      rollup: 3.29.4
     optionalDependencies:
       fsevents: 2.3.2
     dev: true
 
-  /vitefu@0.2.4(vite@4.3.9):
+  /vitefu@0.2.4(vite@4.5.2):
     resolution: {integrity: sha512-fanAXjSaf9xXtOOeno8wZXIhgia+CZury481LsDaV++lSvcU2R9Ch2bPh3PYFyoHW+w9LqAeYRISVQjUIew14g==}
     peerDependencies:
       vite: ^3.0.0 || ^4.0.0
@@ -1393,7 +1396,7 @@ packages:
       vite:
         optional: true
     dependencies:
-      vite: 4.3.9
+      vite: 4.5.2
     dev: true
 
   /wrappy@1.0.2:

From d34eee1c5d3cf598f8a89a4407b639f75f1d30a1 Mon Sep 17 00:00:00 2001
From: David OK <david.ok8@gmail.com>
Date: Fri, 29 Mar 2024 13:10:43 +0000
Subject: [PATCH 04/49] WIP: rewording.

---
 doc/book/random/vector_intrinsics.Rmd | 126 +++++++++++++++-----------
 1 file changed, 74 insertions(+), 52 deletions(-)

diff --git a/doc/book/random/vector_intrinsics.Rmd b/doc/book/random/vector_intrinsics.Rmd
index 2f5bed0a0..9ea3439ff 100644
--- a/doc/book/random/vector_intrinsics.Rmd
+++ b/doc/book/random/vector_intrinsics.Rmd
@@ -1,51 +1,51 @@
 # Super Fast Separable Convolutions
 
 Sounds boring? I promise you this is going to be way more interesting than you
-might think. There's quite a bit interesting things to learn.
+might think. While this is a common low-level operation, there are
+quite a few interesting things to learn about how to leverage the hardware to
+accelerate computing.
 
 Story time before we dive into the implementation of the Gaussian blur.
 
 ## Story Time
 
-Once I applied for a C++ technical leadership role for some company. I was
-considered for the role after a preliminary behavioral screening by the CTO. He
-then told me who I would be interviewing with in the next interviews. A few days
-later, he called me back for a quick chat. He ended up telling me they would not
-move forward after examining sara's image processing code. Without really
-explaining why, I guessed that probably one of their senior engineers
-disqualified me as he did care more about my ability in understanding and
-manipulating CPU vector instructions.
+Once, I applied for a C++ technical leadership role for some company. I did a
+behavioral interview with their CTO. He then told me who I would be interviewing
+with in the next interviews. A few days later, he called me back for a quick
+chat. He ended up telling me they would not move forward after examining sara's
+image processing code. He did not really explain why; the most likely reason I
+could guess was that one of their senior engineers disqualified me because all
+he cared about was whether I could understand and write code with CPU vector
+instructions.
 
 The CTO profusedly apologized to me. He said politely that I certainly was
-gifted but my C++ code was not to their standard. From what I guessed, they
-probably found that my image processing code was implemented too naively.
+gifted but my code was not up to their standard. Indeed they must have deemed that
+my image processing code was implemented too naively.
 
-This was based on the fact that the CTO told me they were doing lots of
+During the behavioral interview, the CTO told me they were doing lots of
 optimization involving CPU vector intrinsics in order to calculate data as fast
 as possible. That it really was not that difficult and that I could get up to
-speed fairly quickly.
-
-I have mostly an applied math background, so it did sound unfair to me. It did
-made me feel that you are never enough whatever you achieve. Like you do need to
-know every single aspect of engineering from high level to low level. In
-hindsight I would not have been happy with the job anyways. Still frustrating...
-In that moment, I was telling myself: what can you do when you are already being
-shut the door? Going by the same logic, if David Lowe showcased his SIFT
-research code, he would not qualify for the job either: I learnt from studying
-his code.
-
-Right, back to the topic: today how can we achieve that? Nowadays, we have some
-answers with Halide. And we can do it very elegantly without coding in assembly
-directly.
+speed fairly quickly. That was the main reason I believe they disqualified me.
+
+With my mostly applied math background, it did sound unfair and hypocritical
+to me. Hypocritical because, if it really was easy, then why could I not learn
+on the job?
+So it did make me feel that you are never enough whatever you achieve in life.
+Oh, so you're supposed to master every single damn thing of software engineering
+from high level to low level when you start a job?
+Going by the same logic, if David Lowe showcased his SIFT research code, he
+would not qualify for the job either since I learnt from studying his code.
+
+End of the rant... In hindsight I would not have been happy with the job
+anyways. Back to the topic: today how can we achieve that? Nowadays, we have
+some answers with Halide. And we can do it very elegantly without writing
+assembly code directly.
 
 ## What is Halide?
 
-With Halide you can write a an image processing filter and optimize the way it is run.
-
-You write sets of arithmetic instructions that operates on image buffers with
-specific parallelism patterns (multicore and vectorization). Then you can tell
-to compile with a C++ method to generate the optimize the image filter as a C++
-static library.
+With Halide you can write an image processing filter and optimize it for each
+specific hardware and architecture. All of this in *very few lines of code*.
+Then Halide will compile the resulting code into usable assembly code.
 
 The main beauty with Halide is that you can decouple:
 
@@ -57,17 +57,39 @@ Halide can check and provide guarantees that your algorithm remains correct for
 the schedule you are implementing.
 
 With Halide, you won't write any nested `for` loops and multi-threaded
-processing with bound-checking. So you can express ideas at a higher level.
+processing with bound-checking. Instead you express these ideas at a higher
+level.
+
+Halide abstracts the different kinds of parallelism for you and supports a wide
+range of platforms. You don't have to be an expert in CPU vector intrinsics, but
+you do need to know the schedule strategies to, say, optimize the speed at which
+your convolutional operations run.
 
-Halide abstracts these parallelisms for you and supports a wide range of
-platforms. You don't have to be an expert in CPU vector intrinsics, but you do
-need to know the schedule strategies to say optimize the speed at which your
-convolutional operations run. Halide has identified the most common schedule
-patterns that are used to optimize image processing code.
+You still need to skim through the publications and presentations, and practise.
+But in my experience, it is still difficult for the layman or the novice to
+identify the schedule patterns that work and those that don't.
 
-You still need to skim the publications and the presentations, practise. But in
-my experience, it is still difficult for the layman or the novice to identify
-the schedule patterns that work and those that don't.
+### Halide vs Explicit CPU intrinsic code
+
+Naysayers be like: "but Halide is too high level and does too much magic!"
+
+Optimizing algorithms by explicitly writing CPU intrinsic instructions can
+certainly be done. But you would have to pay a very costly engineering price.
+You have to optimize for different CPU platforms: x86-64, ARM, RISC-V and learn
+about their different intrinsic APIs. The resulting code is much harder to
+maintain than using a unified language that allows you to write these in very
+few lines of code.
+
+Unless this is your full-time job and something you really want to learn, that's
+not something a computer vision scientist like me would like to spend time on...
+Halide has done an excellent job in abstracting this, at least on the CPU
+side. So know who you are and decide what you want to do.
+
+Let's conclude this paragraph with a few words regarding the GPU acceleration
+with Halide. On the GPU side, Halide is indeed not yet mature. For one thing,
+the documentation is still lacking regarding the memory model at the time of
+writing. You will be much better off writing code in CUDA, Vulkan, or OpenCL to
+fully control your GPU.
 
 
 ## Naive C++ Implementation of the Gaussian Blur
@@ -81,11 +103,11 @@ the big lie: don't prematurely optimize and just let the compiler do its job.
 And just make things work.
 
 I did learn in class that the Gaussian filter is separable: Don't write the 2D
-convolution naively, exploit its separable property. I am proudly exhibiting a
-naive implementation in C++ in my Sara repo, something along the lines:
-
-Icing on the cake, I have discovered multicore parallelism via OpenMP.
+convolution naively, exploit its separable property. Icing on the cake, I have
+discovered multicore parallelism via OpenMP. I am proudly exhibiting a naive
+implementation in C++ in my Sara repo, something along the lines below.
 
+Below is the first step, that is the x-convolution.
 ```{Rcpp}
 auto conv_x(const float *in, const float *kernel, float *out,
             const int w, const int h, const int ksz) -> void
@@ -121,7 +143,7 @@ auto conv_x(const float *in, const float *kernel, float *out,
 }
 ```
 
-We execute the same dance for the y-convolution:
+Then we perform the same dance for the y-convolution:
 
 ```{Rcpp}
 auto conv_y(const float *in, const float *kernel, float *out,
@@ -173,13 +195,13 @@ enough without elaborating why and what he was expecting to see.
 
 Fine! let's see how we could optimize the code...
 
-OK How? I vaguely understand you have to write the code with CPU intrinsics?
-Should I write in C-style or in ASM. How to do it with limited bandwidth after
-finishing your day job and wanting to learn?
+I vaguely understand we have to rewrite the code explicitly with CPU intrinsics?
+Should I write it in C-style or in ASM code? How to do it with limited bandwidth
+after finishing your day job and wanting to learn?
 
-Use Halide? Yes! It is a very elegant language. It can also compile your code
-directly in OpenCL and Vulkan, Direct3D bit code. Here is how we could rewrite
-the 2D separable convolution.
+Can we use Halide? Yes! It is a very elegant language. It can also compile your
+code directly in OpenCL and Vulkan, Direct3D bit code. Here is how we could
+rewrite the 2D separable convolution.
 
 Let us write the implementation first.
 

From 4b02615497e71a470bf99fd76fadbd6e220b85dd Mon Sep 17 00:00:00 2001
From: Odd Kiva <2375733-oddkiva@users.noreply.gitlab.com>
Date: Sun, 31 Mar 2024 15:31:40 +0100
Subject: [PATCH 05/49] MAINT: update to Halide 17 and LLVM 17 for MacOS.

---
 build.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/build.py b/build.py
index 512a9eefb..59507912f 100755
--- a/build.py
+++ b/build.py
@@ -28,7 +28,7 @@
 CUDA_VERSION = "12.1.0"
 TRT_VERSION = "8.6"
 SWIFT_VERSION = "5.9.1"
-HALIDE_VERSION = "16.0.0"
+HALIDE_VERSION = "17.0.0"
 
 # Docker
 SARA_SOURCE_DIR = pathlib.Path(__file__).parent.resolve()
@@ -186,7 +186,7 @@ def generate_project(
         my_cmake_prefix_paths.append(HALIDE_ROOT_PATH)
     elif SYSTEM == "Darwin":
         cmake_options.append("-D SARA_USE_HALIDE:BOOL=ON")
-        llvm_dir = subprocess.check_output(["brew", "--prefix", "llvm@16"])
+        llvm_dir = subprocess.check_output(["brew", "--prefix", "llvm"])
         llvm_dir = llvm_dir.decode(sys.stdout.encoding).strip()
         llvm_cmake_dir = pathlib.Path(llvm_dir) / "lib" / "cmake" / "llvm"
         cmake_options.append(f"-D LLVM_DIR={llvm_cmake_dir}")

From b04a7f42e4b7c00e055b9fa7fc1c7ca11f434536 Mon Sep 17 00:00:00 2001
From: David OK <david.ok8@gmail.com>
Date: Mon, 1 Apr 2024 20:30:29 +0100
Subject: [PATCH 06/49] DOC: rewrite.

---
 doc/book/random/vector_intrinsics.Rmd | 282 +++++++++++++++-----------
 1 file changed, 168 insertions(+), 114 deletions(-)

diff --git a/doc/book/random/vector_intrinsics.Rmd b/doc/book/random/vector_intrinsics.Rmd
index 9ea3439ff..b658348b7 100644
--- a/doc/book/random/vector_intrinsics.Rmd
+++ b/doc/book/random/vector_intrinsics.Rmd
@@ -19,8 +19,8 @@ he cared about was whether I could understand and write code with CPU vector
 instructions.
 
 The CTO profusedly apologized to me. He said politely that I certainly was
-gifted but my code was not up to their standard. Indeed they must have deemed that
-my image processing code was implemented too naively.
+gifted but my code was not up to their standard. They must have deemed that my
+image processing code was implemented too naively.
 
 During the behavioral interview, the CTO told me they were doing lots of
 optimization involving CPU vector intrinsics in order to calculate data as fast
@@ -36,54 +36,66 @@ from high level to low level when you start a job?
 Going by the same logic, if David Lowe showcased his SIFT research code, he
 would not qualify for the job either since I learnt from studying his code.
 
-End of the rant... In hindsight I would not have been happy with the job
-anyways. Back to the topic: today how can we achieve that? Nowadays, we have
-some answers with Halide. And we can do it very elegantly without coding in
-assembly code directly.
+In hindsight I would not have been happy with the job anyways. End of the rant
+and back to the topic: today how can we achieve that? Nowadays, we have some
+answers with Halide. And we can do it very elegantly without coding in assembly
+code directly.
 
 ## What is Halide?
 
-With Halide you can write a an image processing filter and optimize for each
-specific hardware and architecture. All of this in a *very few lines of code*.
-Then Halide will compile the resulting code into usable assembly code.
+In a technical jargon, Halide is an domain specific language embedded in C++.
+The specific domain we are dealing is image processing. It is a real language
+because it compile your optimized algorithm into assembly code.
 
-The main beauty with Halide is that you can decouple:
+With Halide, you can write an image processing filter and optimize for each
+specific hardware and architecture. All of this elegangtly in a *very few lines
+of code*. Then Halide will compile the C++ code into usable assembly code.
 
-1. The algorithm: the separable convolution and,
-2. the scheduling strategy: the parallelism strategy to make it as fast as the
-baseline if not faster than OpenCV's Gaussian blur.
+The main beauty with Halide is that you decouple:
+
+1. the algorithm: in our case, the separable convolution and,
+2. the scheduling strategy: which exploits the different kinds of
+   parallelism that the hardware offers,
+
+so that our implementation is as fast as the baseline implementation. In our
+case, the baseline implementation is OpenCV's Gaussian blur.
 
 Halide can check and provide guarantees that your algorithm remains correct for
 the schedule you are implementing.
 
-With Halide, you won't write any nested `for` loops and multi-threaded
-processing with bound-checking. Instead you express these ideas at a higher
-level.
+Halide is trying to unify the common programming patterns from CPU to GPU
+programming in CUDA, OpenCL, Metal, Vulkan. Similarly to CUDA, an image
+processing algorithm can often expressed as a succesion of CUDA kernels.
+Therfore you don't need to write any nested loops.
 
 Halide abstracts the different kind of parallelisms for you and supports a wide
 range of platforms. You don't have to be an expert in CPU vector intrinsics, but
-you do need to know the schedule strategies to say optimize the speed at which
-your convolutional operations run.
-
-You still need to skim through the publications, the presentations and practise.
-But in my experience, it is still difficult for the layman or the novice to
-identify the schedule patterns that work and those that don't.
+you do need to know some scheduling strategies to best optimize the
+convolutional operation.
+When I began learning Halide, I knew very little about optimal schedules.
+Skimming through publications and the presentations by Halide authors was the
+only way for me to really learn. And of course practise, practise, practise. So
+all in all, in my experience, the entry barrier is still very high for the
+average programmer to identify which schedule patterns work best and those that
+don't.
 
 ### Halide vs Explicit CPU intrisic code
 
 Naysayers be like: "but Halide is too high level and does too much magic!"
 
-Optimizing algorithms by writing explicitly CPU intrinsic instructions can
-certainly be done. But you would have to pay a very costly engineering price.
-You have to optimize for different CPU platforms: x86-64, ARM, RISC-V and learn
-about their different intrinsic API. The resulting code is much harder to
-maintain than using a unified language that allows you to write these in a very
-lines of codes.
+Certainly! Optimizing algorithms by writing explicitly CPU intrinsic
+instructions can be done. But you would have to pay a very costly engineering
+price.
+
+You would have to optimize for different CPU platforms: x86-64, ARM, RISC-V and
+learn about their C API to utilize SIMD instructions. The resulting code would
+be much harder to maintain than using a unified language that allows you to
+write these in a very few lines of codes.
 
-Unless this is your full-time job and something you really want to learn, that's
-not something a computer vision scientist like me would like to spend time to...
-Halide has done an excellent job in abstracting this at the least on the CPU
-side. So know who you are and decide what you want to do.
+Unless this is your full-time job or it's something you really want to learn,
+personally I don't want to spend time into this. Halide has done an excellent
+job in abstracting this at the least on the CPU side. So know who you are and
+decide what you want to do.
 
 Let's conclude this paragraph with a few words regarding the GPU acceleration
 with Halide. On the GPU side, Halide is indeed not yet mature. For one thing,
@@ -92,7 +104,7 @@ writing. You will be much better off writing code in CUDA, Vulkan, or OpenCL to
 fully control your GPU.
 
 
-## Naive C++ Implementation of the Gaussian Blur
+## A Naive Implementation of the Gaussian Blur in C++
 
 Right, let's rewind back in time and imagine myself more than 15 years ago.
 Freshly graduated with a Master's degree looking to showcase some work
@@ -187,23 +199,40 @@ I am proudly showcasing my code on GitHub. Never complained about it as I
 bothered about real-time issues as later I learnt about in CUDA and would write
 in CUDA anyways.
 
+### Issues in the C++ code
+
+Let's enumerate some issues and ideas that we will address later on.
+
+1. The only parallelism we are using is the **multi-threading**.
+2. It is not clear how the **CPU vector intrinsics** can be applied in the convolution.
+   - For one thing, the boundary checking does not easily allow the compiler to
+   vectorize the C++ code.
+   - There is actually a better way to exploit the CPU vector instructions for
+     the convolutional operation and the C++ code does not do it this way.
+3. **Data locality** is a very important aspect, which we are not exploiting.
+
+It certainly wasn't the fault of my younger self who did not know any better.
+Let us now address these issues and ideas in Halide.
+
 
-## Halide Implementation of the Algorithm
+## A Very Fast Implementation in Halide
 
-Back to the time where the CTO and his minions tell you that you are not good
-enough without elaborating why and what he was expecting to see.
+As we said it earlier, Halide splits an algorithm into two parts:
 
-Fine! let's see how we could optimize the code...
+1. the implementation and
+2. the schedule for any algorithm.
 
-I vaguely understand we have to rewrite the code explicitly with CPU intrinsics?
-Should I write it in C-style or in ASM code? How to do it with limited bandwidth
-after finishing your day job and wanting to learn?
+We will now detail each part. This section will be divided into 3 parts.
 
-Can we use Halide? Yes! It is a very elegant language. It can also compile your
-code directly in OpenCL and Vulkan, Direct3D bit code. Here is how we could
-rewrite the 2D separable convolution.
+1. First we implement the separable convolution in Halide.
+2. Then we explain the different kinds of parallelisms that the CPU offers.
+3. Finally we explain the schedule part of the algorithm in Halide and we will
+   explore a few different schedules.
 
-Let us write the implementation first.
+### The Algorithm
+
+First, let us write the implementation that exploits the separability of the
+Gaussian filter.
 
 ```{cpp}
 #include <Halide.h>
@@ -219,7 +248,11 @@ static constexpr auto truncation_factor = 4.f;
 static constexpr auto ksz = static_cast<int>(2 * sigma * truncation_factor);
 static constexpr auto kr = ksz / 2;
 
+// N.B.: we just need to define something for the input... but just think of it
+// as an image.
 input(x, y) = x + y;
+
+// The 1D Gaussian filter.
 kernel(x) = Halide::exp(-x * x / (0.5f * sigma * sigma));
 
 auto conv_x = Halide::Func{"conv_x"};
@@ -232,73 +265,92 @@ conv_x(x, y) = Halide::sum(input(x + k - kr, y) * kernel(k));
 conv_y(x, y) = Halide::sum(conv_x(x, y + k - kr / 2) * kernel(k));
 ```
 
-## Shedule Optimization
-
 ### Two Types of Parallelisms on CPUs
 
-There are two types of parallelisms on the CPU.
+There are two types of parallelisms on the CPU which we can exploit altogether.
 
 1. Multicore processing:
-   This is straightforward to understand and is about keep all the CPU cores as
-   busy as possible with minimal data sharing. OpenMP is simple and helps to
-   parallelize image filter quite easily once we identify the parts of the
-   algorithm that operate independently.
+
+   This is straightforward to understand. A CPU can be thought of as a factory of
+   workers, each one of them being called a CPU core. The multicore processing
+   consists in keeping each CPU core as busy as possible with minimal data
+   sharing.
+
+   OpenMP is one implementation of multicore processing among others to
+   parallelise our image filter.
 
 2. Vector instructions:
-   Until I implemented filters with Halide, I could not understand what CPU
-   instrutions were really about.
-   I am not going to pretend to be an expert in CPU optimization but this little
-   paragraph should convince you why it is so interesting to apply vector
-   instructions wherever possible.
-   So, as a first approximation, a CPU vector instruction typically enables the
-   programmer to perform arithmetic operations on small vectors in a single CPU
-   cycle. Typically arithmetic operations such addition, multiplication and more
-   can operate on 4D vectors. That is where we can observe additional 4x speed
-   up or more if your CPU can process those operations on bigger vectors.
 
-For more accurate and more comprehensive information, I will simply encourage
-you to do your own research and share what you have learnt.
+   Until I implemented filters with Halide, I really did not understand what CPU
+   vector instructions were really about.
+
+   Mainly, a CPU vector instruction enables a CPU core to perform arithmetic
+   operations on small vectors in a **single** CPU cycle.
+   That means that additions, subtractions, dot products on 4D float vectors can
+   be executed in a single CPU cycle instead of 4 CPU cycles (7 CPU cycles for
+   the dot product).
+
+   That is very significant on a very large scale as we can observe additional
+   4x speed up or more on very very large arrays and therefore image data.
 
-Like image filter, BLAS routines makes extensive use of the two types
-parallelism on CPU platforms.
+Nowadays, an Intel CPU that supports AVX-512 vector instructions can perform
+operations on 16D vectors of 32-bit floating point data in a single CPU
+instruction.
 
-### Schedule 1
+So when we combine parallelism 1 and 2 on a 12-core Intel CPU with AVX-512
+instructions, an optimized algorithm could theoretically be sped up by a factor
+of $12 \times 16 = 192$ on 32-bit floating point array. This is huge.
+
+Now, for more accurate and more comprehensive information, I will simply
+encourage you to do your own research on ARM, RISC-V CPUs, and share what you
+have learnt.
+
+Likewise, optimized linear algebra routines like OpenBLAS make extensive use of
+the two types of parallelism on CPU platforms.
+
+We are now moving to the schedule, which is the most difficult part of the
+implementation. We will explore 3 schedules with Halide.
+
+### Schedule 1: Naive Strategy
 
 The obvious strategy is to start from the idea of separable convolution and
 vectorize the convolution wherever possible.
 
 We parallelize the computation of image rows. We apply Halide's magic invocation
-to vectorize the convolution without really understanding why but thinks it
-works.
+to vectorize the convolution without trying to really understand why it works.
 
-```
+```{cpp}
+// Precompute the kernel all at once.
+//
+// This is irrelevant in the schedule but necessary when you want to understand
+// the assembly code generated by Halide.
 kernel.compute_root();
 
+// First, compute the x-convolution in a separate memory buffer and use CPU
+// vector instructions.
 conv_x
     .compute_root()
     .parallel(y)
     .vectorize(x, 32, Halide::TailStrategy::GuardWithIf)
     ;
+
+// Second, compute the y-convolution as a second step.
 conv_y
     .parallel(y)
-    .vectorize(x, 32, Halide::TailStrategy::GuardWithIf)
-    ;
+    .vectorize(x, 32, Halide::TailStrategy::GuardWithIf) ;
 ```
 
-Nice improvements over your naive implementation but then you decide to compete
-with OpenCV just to see that it still crushes your implementation by being 2x
-faster.
+Thanks to the CPU vectorization, this schedule is a nice improvement over the
+naive C++ implementation as we exploit the CPU data vectorization. Yet OpenCV's
+implementation is still better than this schedule by a factor of 2.
 
-### Schedule 2: found in one Halide publication.
+We will worry about how the CPU vectorization is implemented later by examining
+a better schedule. Let's move on to a better schedule: schedule #2.
 
-In fact, the optimal schedule is really not obvious in CPU as exposed in Halide
-presentations. Until you dig into Halide publications, you start to understand
-how much work and expertise it is to optimize a typical image processing filter
-in Photoshop and took 3 months of hard work to correctly invoke CPU vector
-instructions.
+### Schedule 2: From a Publication
 
-After digging in the publications, I find this schedule in one of Halide's
-publications:
+In fact, the optimal schedule is really not obvious on the CPU. After digging in
+the publications, I found this schedule in one of the Halide publications:
 
 ```{cpp}
 auto xo = Halide::Var{"xo"};
@@ -313,21 +365,29 @@ conv_y.tile(x, y, xo, yo, xi, yi, 64, 64, Halide::TailStrategy::GuardWithIf)
     .vectorize(xi, 16, Halide::TailStrategy::GuardWithIf)
 ```
 
-This crushes OpenCV's implementation but I don't understand why.
+This literally crushes OpenCV's implementation but I didn't understand why at
+that time. However the main idea in this schedule is to better exploit data
+locality by splitting the image into smaller images.
+
+As argued in a presentation by Halide authors, without Halide, the optimal
+schedule is definitely not obvious and would have necessitated at least 3 months
+of work in order to figure out how to exploit CPU SIMD instructions and the data
+locality in the memory cache.
 
-The first step to achieve this to divide the final convolved image into tiles of
-64 by 64 pixels. The set of tiles can be seen as an input batch of many smaller
-images. The output is another batch of image tiles with the same image sizes
-(let's just assume that the image width and height are multiple of 64.)
+Specifically this schedule divides the final convolved image into square tiles
+of 64 by 64 pixels. The set of input tiles can be seen as an input batch of many
+smaller images and processed in parallel. The output image is obtained by
+reassembling the batch of output image tiles.
 
+We will detail later how the vectorization of the convolution is done.
 
-### Schedule 3: A better one found by myself
+### Schedule 3: An Improved Version by Myself
 
 Because each output image tile is independent of each other, we can calculate
-smaller x-convolution and y-convolution. For each tile we can fit the
-x-convolution in the CPU cache and it improves data locality in the memory
-cache. Then we explot the CPU vector instructions to calculate convolution in
-batch.
+smaller x-convolution and y-convolution that can fit in the memory cache. For
+each tile we can fit the x-convolution in the CPU cache and it improves data
+locality in the memory cache. Then we explot the CPU vector instructions to
+calculate convolution in batch.
 
 
 ```{cpp}
@@ -340,8 +400,7 @@ auto yi = Halide::Var{"yi"};
 auto tile = Halide::Var{"tile"};
 
 // Precalculate the kernel.
-// We want this and this will avoid cluttering Halide's compiled statement later
-// on.
+// We want this to avoid cluttering Halide's compiled statement later on.
 kernel.compute_root();
 
 // The schedule
@@ -357,7 +416,7 @@ conv_x
     ;
 ```
 
-#### Second-Guessing what Halide does
+### Second-Guessing what Halide does
 
 There is a lot to unpack here. Let's try to break it down bit by bit the
 schedule below:
@@ -371,12 +430,11 @@ schedule below:
 
 According to how I understand it:
 ```{cpp}
-#pragma omp parallel for  // .parallel(y)
+#pragma omp parallel for
 for (auto tile_index = 0; tile_index < T; ++T)
 {
-  // Process the tile (xo, yo).
-  const auto yo = tile_index / T;
-  const auto xo = tile_index % T;
+  // Process the tile (xo, yo).
+  const auto yo = tile_index / T; const auto xo = tile_index % T;
 
   for (auto yi = 0; yi < 64; ++yi)
   {
@@ -390,30 +448,26 @@ for (auto tile_index = 0; tile_index < T; ++T)
       const auto x = xo * 64 + xi;
 
 
-      // Inferring from:
+      // compute_at(...) allocates memory on the stack as explained in:
       // https://halide-lang.org/docs/class_halide_1_1_func.html#a800cbcc3ca5e3d3fa1707f6e1990ec83.
       //
-      // Halide must allocate some storage on the stack
-      //
       // I understand that the storage **has** to be 2D, if we want to calculate
       // conv_y to ensure maximum data locality.
       float conv_x[ksz][ksz];
 
-      // TODO: verify this.
-      //
-      // This is trivially vectorizable.
+      // Trivially vectorizable.
       for (auto k = 0; k < ksz; ++k)
         conv_x[yi][xi] = 0.f;
 
-      // Vectorizable in xi? I don't understand how.
+      // Vectorizable in xi? How?
       for (auto k = 0; k < ksz; ++k)
         conv_x[yi][xi] += in[y][x + k] * kernel[k];
 
-      // Vectorizable in xi? I don't understand how.
+      // Vectorizable in xi? How?
       for (auto k = 0; k < ksz; ++ksz)
         conv[y][x] = 0.f;
 
-      // Vectorizable in xi? I don't understand how.
+      // Vectorizable in xi? How?
       for (auto k = 0; k < ksz; ++ksz)
         conv_y[y][x] += conv_x[yi+k][xi] * kernel[k];
     }
@@ -421,16 +475,16 @@ for (auto tile_index = 0; tile_index < T; ++T)
 }
 ```
 
-I still don't understand how the convolutions are vectorized in the variable
+It is not yet clear to me how the convolutions are vectorized in the variable
 `xi`. This is the goal of the next paragraph.
 
 ### Understanding with Halide Compiled Statement
 
-OK the second-guessing turns out to be not that bad but we really understand
-what Halide does for us.
+The second-guessing turns out to be not that bad but we really need to
+understand what Halide does for us. How is the CPU vectorization done?
 
-To get a more definite answer, let's pluck up the courage to actually inspect
-the assembly code generated by Halide.
+To get a more definite answer, let's actually inspect the assembly code
+generated by Halide.
 
 ```{cpp}
 conv_y.compile_to_stmt(
@@ -481,7 +535,7 @@ style="border: 1px solid #464646;" allowfullscreen>
 </iframe>
 
 It is not practical to view the embedded HTML page, so go to this <a
-href="./random/separable_conv_2d.stmt.html">link</a> below to fully explore the
+href="./random/separable_conv_2d.stmt.html">link</a> to fully explore the
 compiled statement as a full page:
 
 

From 0d57d836eadba3b790b4cef57304423577925674 Mon Sep 17 00:00:00 2001
From: David OK <david.ok8@gmail.com>
Date: Mon, 1 Apr 2024 21:23:02 +0100
Subject: [PATCH 07/49] DOC: fix code.

---
 doc/book/random/vector_intrinsics.Rmd | 35 ++++++++++++++-------------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/doc/book/random/vector_intrinsics.Rmd b/doc/book/random/vector_intrinsics.Rmd
index b658348b7..18f38d72b 100644
--- a/doc/book/random/vector_intrinsics.Rmd
+++ b/doc/book/random/vector_intrinsics.Rmd
@@ -384,10 +384,10 @@ We will detail later how the vectorization of the convolution is done.
 ### Schedule 3: An Improved Version by Myself
 
 Because each output image tile is independent of each other, we can calculate
-smaller x-convolution and y-convolution that can fit in the memory cache. For
-each tile we can fit the x-convolution in the CPU cache and it improves data
-locality in the memory cache. Then we explot the CPU vector instructions to
-calculate convolution in batch.
+their x-convolution and y-convolution that can fit in the memory cache. For each
+tile we can fit the x-convolution in the CPU cache and it improves data locality
+in the memory cache. Then we exploit the CPU vector instructions to calculate
+convolution in batch.
 
 
 ```{cpp}
@@ -416,7 +416,7 @@ conv_x
     ;
 ```
 
-### Second-Guessing what Halide does
+### Second-Guessing What Halide Does
 
 There is a lot to unpack here. Let's try to break it down bit by bit the
 schedule below:
@@ -426,6 +426,7 @@ schedule below:
       .fuse(xo, yo, tile_index)
       .parallel(tile_index);
       .vectorize(xi, 16, Halide::TailStrategy::GuardWithIf)
+      ;
 ```
 
 According to how I understand it:
@@ -541,27 +542,27 @@ compiled statement as a full page:
 
 ### Vectorizing in the Convolution
 
-Eventually, this was not so hard and this gives a very tangible hands-on
-introduction to assembly code.
+Upon inspection of the compiled statement, it turns out that the convolution
+operation is implemented by batch where we calculate 4, 8 or 16 convolved values
+at the same time by repeating the vectorized fused multiply-add `fmla.4s`
+operation.
 
-Then it becomes clear that the convolution operation is implemented by batch
-where we calculate 4, 8 or 16 convolved values at the same time by repeating
-the vectorized fused multiply-add `fmla.4s` operation.
+For example, the vectorization of the x-convolution can be translated
+equivalently in NumPy code as
 
-The vectorization the x-convolution can be translated in terms of NumPy equivalent code
-
-```
+```{python}
 import numpy as np
 
-def convolve_vectorized(conv_x, in, kernel, tile, xi, yi):
-    # Trivially vectorizable initialization.
+def convolve_vectorized(conv_x, input, kernel, tile, xi, yi):
+    # Trivially vectorized initialization.
     conv_x[tile, yi, xi:xi+ksz] = 0
 
     # Repeat the fused multiply-add operation as follows.
     ksz = kernel.shape[0]
     for k in range(ksz):
-        conv_x[tile, yi, xi:xi+32] = conv_x[tile, yi, xi:xi+32] \
-                                   + in[tile, yi, xi:xi+32] * kernel[k]
+        conv_x[tile, yi, xi: xi+ 32] = \
+            conv_x[tile, yi, xi:xi + 32] + \
+            input[tile, yi, xi + k:xi + 32 + k] * kernel[k]
 ```
 
 

From 1e3ef2a60b34413d14f03163387baa96130fd42d Mon Sep 17 00:00:00 2001
From: David OK <david.ok8@gmail.com>
Date: Wed, 3 Apr 2024 13:20:29 +0100
Subject: [PATCH 08/49] MAINT: update swift compiler and docker images.

---
 build.py          |  4 ++--
 docker/Dockerfile | 12 ++++++------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/build.py b/build.py
index 59507912f..66a791a88 100755
--- a/build.py
+++ b/build.py
@@ -27,8 +27,8 @@
 UBUNTU_VERSION = "22.04"
 CUDA_VERSION = "12.1.0"
 TRT_VERSION = "8.6"
-SWIFT_VERSION = "5.9.1"
-HALIDE_VERSION = "17.0.0"
+SWIFT_VERSION = "5.10"
+HALIDE_VERSION = "17.0.1"
 
 # Docker
 SARA_SOURCE_DIR = pathlib.Path(__file__).parent.resolve()
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 862d826c1..052be2c91 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -75,14 +75,14 @@ RUN pip3 install \
       pybind11
 
 # Install Swift toolchain.
-RUN wget https://download.swift.org/swift-5.9.1-release/ubuntu2204/swift-5.9.1-RELEASE/swift-5.9.1-RELEASE-ubuntu22.04.tar.gz
-RUN tar xvzf swift-5.9.1-RELEASE-ubuntu22.04.tar.gz  \
-      && mv swift-5.9.1-RELEASE-ubuntu22.04 /opt
+RUN wget https://download.swift.org/swift-5.10-release/ubuntu2204/swift-5.10-RELEASE/swift-5.10-RELEASE-ubuntu22.04.tar.gz
+RUN tar xvzf swift-5.10-RELEASE-ubuntu22.04.tar.gz  \
+      && mv swift-5.10-RELEASE-ubuntu22.04 /opt
 
 # Install Halide.
-RUN wget https://github.com/halide/Halide/releases/download/v16.0.0/Halide-16.0.0-x86-64-linux-1e963ff817ef0968cc25d811a25a7350c8953ee6.tar.gz
-RUN tar xvzf Halide-16.0.0-x86-64-linux-1e963ff817ef0968cc25d811a25a7350c8953ee6.tar.gz && \
-      mv Halide-16.0.0-x86-64-linux /opt
+RUN wget https://github.com/halide/Halide/releases/download/v17.0.1/Halide-17.0.1-x86-64-linux-52541176253e74467dabc42eeee63d9a62c199f6.tar.gz
+RUN tar xvzf Halide-17.0.1-x86-64-linux-52541176253e74467dabc42eeee63d9a62c199f6.tar.gz && \
+      mv Halide-17.0.1-x86-64-linux /opt
 
 # Please make my life easier
 # TODO: install neovim, etc.

From c325a692b3c141ce9ae6e3e7638c0dbab3902ead Mon Sep 17 00:00:00 2001
From: David OK <david.ok8@gmail.com>
Date: Wed, 3 Apr 2024 13:24:02 +0100
Subject: [PATCH 09/49] MAINT: update docker image.

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 6a0db6f3a..5fcb971a2 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -18,7 +18,7 @@ jobs:
         # os:  [macos-latest, windows-latest]
         include:
           - os: ubuntu-latest
-            container: 'oddkiva/sara-devel:cuda12.1.0-ubuntu22.04-trt8.6-swift5.9.1-halide16.0.0'
+            container: 'oddkiva/sara-devel:cuda12.1.0-ubuntu22.04-trt8.6-swift5.10-halide17.0.1'
           - os: ubuntu-latest
             container: 'oddkiva/sara-emsdk-devel:latest'
 

From 29cf35c7410d8e0ee51ebf84f9ebd4bf871da69a Mon Sep 17 00:00:00 2001
From: Odd Kiva <2375733-oddkiva@users.noreply.gitlab.com>
Date: Wed, 3 Apr 2024 17:17:24 +0100
Subject: [PATCH 10/49] DOC: reword.

---
 doc/book/random/vector_intrinsics.Rmd | 107 ++++++++++++++------------
 1 file changed, 59 insertions(+), 48 deletions(-)

diff --git a/doc/book/random/vector_intrinsics.Rmd b/doc/book/random/vector_intrinsics.Rmd
index 18f38d72b..b8df9152c 100644
--- a/doc/book/random/vector_intrinsics.Rmd
+++ b/doc/book/random/vector_intrinsics.Rmd
@@ -12,40 +12,41 @@ Story time before we dive into the implementation of the Gaussian blur.
 Once, I applied for a C++ technical leadership role for some company. I did a
 behavioral interview with their CTO. He then told me who I would be interviewing
 with in the next interviews. A few days later, he called me back for a quick
-chat. He ended up telling me they would not move forward after examining sara's
-image processing code. Without really explaining why, the most likely reason I
-could guess was that one of their senior engineers disqualified me because all
-he cared about was whether I could understand and write code with CPU vector
-instructions.
+chat. He ended up telling me they would not move forward after examining
+*Sara*'s image processing code. Without really explaining why, the most likely
+reason I could come up with was that one of their senior engineers disqualified
+me because all he cared about was whether I could understand and write code with
+CPU vector instructions.
 
 The CTO profusedly apologized to me. He said politely that I certainly was
 gifted but my code was not up to their standard. They must have deemed that my
 image processing code was implemented too naively.
 
-During the behavioral interview, the CTO told me they were doing lots of
-optimization involving CPU vector intrinsics in order to calculate data as fast
-as possible. That it really was not that difficult and that I could get up to
-speed fairly quickly. That was the main reason I believe they disqualified me.
+Indeed I remembered that during the behavioral interview, the CTO told me they
+were doing lots of optimization involving CPU vector instructions in order to
+process data as fast as possible. That it really was not that difficult and that
+I could get up to speed fairly quickly, and blah blah blah... That was the only
+reason for which, I believe, they disqualified me.
 
 Having mostly an applied math background, it did sound unfair and hypocritical
 to me. Hypocritical because, if it really was easy, then why can you not learn
-on the job?
-So it did make me feel that you are never enough whatever you achieve in life.
+it on the job?
+So, yes, it did make me feel that you are never enough whatever you achieve in
+life.
 Oh, so you're supposed to master every single damn thing of software engineering
-from high level to low level when you start a job?
+when you start a job?
 Going by the same logic, if David Lowe showcased his SIFT research code, he
 would not qualify for the job either since I learnt from studying his code.
 
 In hindsight I would not have been happy with the job anyways. End of the rant
 and back to the topic: today how can we achieve that? Nowadays, we have some
-answers with Halide. And we can do it very elegantly without coding in assembly
-code directly.
+answers with Halide, which allows us to do it very elegantly.
 
 ## What is Halide?
 
 In a technical jargon, Halide is an domain specific language embedded in C++.
-The specific domain we are dealing is image processing. It is a real language
-because it compile your optimized algorithm into assembly code.
+The specific domain we are dealing is image processing. It is a language because
+it compiles your optimized algorithm into assembly code.
 
 With Halide, you can write an image processing filter and optimize for each
 specific hardware and architecture. All of this elegangtly in a *very few lines
@@ -60,18 +61,16 @@ The main beauty with Halide is that you decouple:
 so that our implementation is as fast as the baseline implementation. In our
 case, the baseline implementation is OpenCV's Gaussian blur.
 
-Halide can check and provide guarantees that your algorithm remains correct for
-the schedule you are implementing.
+Halide is trying to unify common programming patterns that arise from CPU to GPU
+programming in CUDA, OpenCL, Metal, Vulkan. In Halide, an image processing
+algorithm can often expressed as a succesion of CUDA kernels. Therefore you
+don't need to write any nested loops.
 
-Halide is trying to unify the common programming patterns from CPU to GPU
-programming in CUDA, OpenCL, Metal, Vulkan. Similarly to CUDA, an image
-processing algorithm can often expressed as a succesion of CUDA kernels.
-Therfore you don't need to write any nested loops.
+Halide abstracts different kinds of parallelism and supports a wide range of
+platforms. While we don't have to know about the C API to call the vector
+instructions, we still need to know the common programming patterns that
+involves CPU vector instructions.
 
-Halide abstracts the different kind of parallelisms for you and supports a wide
-range of platforms. You don't have to be an expert in CPU vector intrinsics, but
-you do need to know some scheduling strategies to best optimize the
-convolutional operation.
 When I began learning Halide, I knew very little about optimal schedules.
 Skimming through publications and the presentations by Halide authors was the
 only way for me to really learn. And of course practise, practise, practise. So
@@ -81,11 +80,10 @@ don't.
 
 ### Halide vs Explicit CPU intrisic code
 
-Naysayers be like: "but Halide is too high level and does too much magic!"
+Naysayers will argue: "But Halide is too high level and does too much magic!"
 
-Certainly! Optimizing algorithms by writing explicitly CPU intrinsic
-instructions can be done. But you would have to pay a very costly engineering
-price.
+Certainly, optimizing algorithms by writing explicitly CPU vector instructions
+can be done. But we would have to pay a very costly engineering price.
 
 You would have to optimize for different CPU platforms: x86-64, ARM, RISC-V and
 learn about their C API to utilize SIMD instructions. The resulting code would
@@ -117,7 +115,7 @@ And just make things work.
 I did learn in class that the Gaussian filter is separable: Don't write the 2D
 convolution naively, exploit its separable property. Icing on the cake, I have
 discovered multicore parallelism via OpenMP. I am proudly exhibiting a naive
-implementation in C++ in my Sara repo, something along the lines below.
+implementation in C++ in my *Sara* repo, something along the lines below.
 
 Below is the first step, that is the x-convolution.
 ```{Rcpp}
@@ -272,9 +270,11 @@ There are two types of parallelisms on the CPU which we can exploit altogether.
 1. Multicore processing:
 
    This is straightforward to understand. A CPU can be thought as a factory of
-   workers, each one of them being called a CPU core. The multicore processing
-   consists in keeping each CPU core as busy as possible with minimal data
-   sharing.
+   workers, each one of them being called a CPU core.
+
+   The multicore processing becomes most effective when each CPU core is being
+   kept as busy as possible and the CPU cores don't need to communicate with
+   each other via data synchronization.
 
    OpenMP is one implementation of multicore processing among others to
    parallelise our image filter.
@@ -298,16 +298,25 @@ operations on 16D vectors of 32-bit floating point data in a single CPU
 instruction.
 
 So when we combine parallelism 1 and 2 on an 12-core Intel CPU with AVX-512
-instructions, an optimized algorithm could theoretically be sped up by a factor
-of $12 \times 16 = 192$ on 32-bit floating point array. This is huge.
+instructions, an optimized algorithm could in principle be sped up by a factor
+of
+
+\begin{equation}
+  \#\{\text{CPUs}\} \times \text{SIMD max dim} = 12 \times 16 = 192
+\end{equation}
 
-Now, for more accurate and more comprehensive information, I will simply
-encourage you to do your own research on ARM, RISC-V CPUs, and share what you
-have learnt.
+on single-precision floating point data.
+
+While we should instead use Amdahl's law instead of this naive formula, applying
+this formula is not so simple and we will be content to just benchmark and
+measure the performance gain.
 
 Likewise, optimized linear algebra routines like OpenBLAS makes extensive use of
 the two types parallelism on CPU platforms.
 
+For more accurate and more comprehensive information, I will simply encourage
+you to do your own research on ARM, RISC-V CPUs, and share what you have learnt.
+
 We are now moving to the schedule, which is the most difficult part of the
 implementation. We will explore 3 schedules with Halide.
 
@@ -340,17 +349,18 @@ conv_y
     .vectorize(x, 32, Halide::TailStrategy::GuardWithIf) ;
 ```
 
-Thanks to the CPU vectorization, this schedule is a nice improvement over the
-naive C++ implementation as we exploit the CPU data vectorization. Yet OpenCV's
-implementation is still better than this schedule by a factor 2.
+This schedule is a nice improvement over the naive C++ implementation as we
+exploit the CPU data vectorization. Yet OpenCV's implementation is still better
+than this schedule by a factor 2.
 
-We will worry about how the CPU vectorization is implemented later by examining
-a better schedule. Let's move on to a better schedule: schedule #2.
+Later we will examine how the CPU vectorization is implemented by inspecting a
+better schedule. So let's move on to a better schedule: schedule #2.
 
 ### Schedule 2: From a Publication
 
-In fact, the optimal schedule is really not obvious in CPU. After digging in the
-publications, I find this schedule in one of Halide publications:
+The optimal schedule is really not obvious in CPU, at least for me. After
+digging in the publications, I found this schedule in one of Halide
+publications:
 
 ```{cpp}
 auto xo = Halide::Var{"xo"};
@@ -359,7 +369,8 @@ auto yo = Halide::Var{"yo"};
 auto xi = Halide::Var{"xi"};
 auto yi = Halide::Var{"yi"};
 
-conv_y.tile(x, y, xo, yo, xi, yi, 64, 64, Halide::TailStrategy::GuardWithIf)
+conv_y.tile(x, y, xo, yo, xi, yi, 64, 64,
+            Halide::TailStrategy::GuardWithIf)
     .fuse(xo, yo, tile_index)
     .parallel(tile_index);
     .vectorize(xi, 16, Halide::TailStrategy::GuardWithIf)
@@ -554,7 +565,7 @@ in NumPy code as
 import numpy as np
 
 def convolve_vectorized(conv_x, input, kernel, tile, xi, yi):
-    # Trivially vectorized initialization.
+    # Trivially vectorized.
     conv_x[tile, yi, xi:xi+ksz] = 0
 
     # Repeat the fused multiply-add operation as follows.

From c453fe80dfd7b9cd2c8f8f03500ad8619eca3c97 Mon Sep 17 00:00:00 2001
From: Odd Kiva <2375733-oddkiva@users.noreply.gitlab.com>
Date: Wed, 3 Apr 2024 19:18:54 +0100
Subject: [PATCH 11/49] DOC: reword.

---
 doc/book/random/vector_intrinsics.Rmd | 94 +++++++++++++++------------
 1 file changed, 53 insertions(+), 41 deletions(-)

diff --git a/doc/book/random/vector_intrinsics.Rmd b/doc/book/random/vector_intrinsics.Rmd
index b8df9152c..c24e2df62 100644
--- a/doc/book/random/vector_intrinsics.Rmd
+++ b/doc/book/random/vector_intrinsics.Rmd
@@ -54,29 +54,28 @@ of code*. Then Halide will compile the C++ code into usable assembly code.
 
 The main beauty with Halide is that you decouple:
 
-1. the algorithm: in our case, the separable convolution and,
-2. the scheduling strategy: which exploits the different kinds of
-   parallelism that the hardware offers,
+1. the **algorithm**: in our case, the **separable convolution** and,
+2. the **schedule**: which exploits the different **kinds of parallelism** that
+   the hardware offers,
 
 so that our implementation is as fast as the baseline implementation. In our
 case, the baseline implementation is OpenCV's Gaussian blur.
 
-Halide is trying to unify common programming patterns that arise from CPU to GPU
-programming in CUDA, OpenCL, Metal, Vulkan. In Halide, an image processing
-algorithm can often expressed as a succesion of CUDA kernels. Therefore you
+Halide abstracts and unifies the **common programming patterns** that arise from
+CPU to GPU programming in CUDA, OpenCL, Metal, Vulkan. Besides, since an image
+processing algorithm can often be expressed as a sequence of CUDA kernels, you
 don't need to write any nested loops.
 
 Halide abstracts different kinds of parallelism and supports a wide range of
-platforms. While we don't have to know about the C API to call the vector
-instructions, we still need to know the common programming patterns that
-involves CPU vector instructions.
+platforms. While we don't have to know the C API to invoke vector instructions,
+we still need to know those common programming patterns that involve CPU vector
+instructions.
 
 When I began learning Halide, I knew very little about optimal schedules.
 Skimming through publications and the presentations by Halide authors was the
 only way for me to really learn. And of course practise, practise, practise. So
 all in all, in my experience, the entry barrier is still very high for the
-average programmer to identify which schedule patterns work best and those that
-don't.
+average programmer to identify which schedules work best and those that don't.
 
 ### Halide vs Explicit CPU intrisic code
 
@@ -85,10 +84,10 @@ Naysayers will argue: "But Halide is too high level and does too much magic!"
 Certainly, optimizing algorithms by writing explicitly CPU vector instructions
 can be done. But we would have to pay a very costly engineering price.
 
-You would have to optimize for different CPU platforms: x86-64, ARM, RISC-V and
-learn about their C API to utilize SIMD instructions. The resulting code would
-be much harder to maintain than using a unified language that allows you to
-write these in a very few lines of codes.
+You would have to optimize for different CPU platforms: *x86-64*, *ARM*,
+*RISC-V* and learn their respective C API to utilize hardware SIMD instructions.
+The resulting code would be much lengthier and harder to maintain than using a
+unified language that allows you to write these in a very few lines of codes.
 
 Unless this is your full-time job or it's something you really want to learn,
 personally I don't want to spend time into this. Halide has done an excellent
@@ -117,8 +116,12 @@ convolution naively, exploit its separable property. Icing on the cake, I have
 discovered multicore parallelism via OpenMP. I am proudly exhibiting a naive
 implementation in C++ in my *Sara* repo, something along the lines below.
 
+### The x-convolution
+
 Below is the first step, that is the x-convolution.
 ```{Rcpp}
+#include <algorithm>
+
 auto conv_x(const float *in, const float *kernel, float *out,
             const int w, const int h, const int ksz) -> void
 {
@@ -132,15 +135,12 @@ auto conv_x(const float *in, const float *kernel, float *out,
     for (auto x = 0; x < w; ++x)
     {
       auto val = 0.f;
+
+      // Calculate the convolved value.
       for (auto k = 0; k < ksz; ++k)
       {
-        auto xk = x - r + k;
-
         // Check the boundary conditions.
-        if (xk < 0)
-          xk = 0;
-        else if (xk >= w)
-          xk = w - 1;
+        const auto xk = std::clamp(x - r + k, 0, w - 1);
 
         // Accumulate.
         const auto xy = y * w + xk;
@@ -153,9 +153,13 @@ auto conv_x(const float *in, const float *kernel, float *out,
 }
 ```
 
-Then we perform the same dance for the y-convolution:
+### The y-convolution
+
+Then we perform the same dance for the y-convolution in a similar fashion.
 
 ```{Rcpp}
+#include <algorithm>
+
 auto conv_y(const float *in, const float *kernel, float *out,
             const int w, const int h, const int ksz) -> void
 {
@@ -169,15 +173,12 @@ auto conv_y(const float *in, const float *kernel, float *out,
     for (auto x = 0; x < w; ++x)
     {
       auto val = 0.f;
+
+      // Calculate the convolved value.
       for (auto k = 0; k < ksz; ++k)
       {
-        auto yk = y - r + k;
-
         // Check the boundary conditions.
-        if (yk < 0)
-          yk = 0;
-        else if (yk >= h)
-          yk = h - 1;
+        const auto yk = std::clamp(y - r + k, 0, w - 1);
 
         // Accumulate.
         const auto xy = yk * w + x;
@@ -193,21 +194,27 @@ auto conv_y(const float *in, const float *kernel, float *out,
 I diligently write unit tests, validate on synthetic tests, check the boundary
 conditions and try it on a real image with a Gaussian kernel. I am happy it
 works reasonably fast when compiling in Release mode on Visual Studio. Job done!
-I am proudly showcasing my code on GitHub. Never complained about it as I
-bothered about real-time issues as later I learnt about in CUDA and would write
-in CUDA anyways.
+I am proudly showcasing my code on GitHub. Never complained about it as I never
+bothered about real-time considerations. Instead I would port the code to make
+the algorithm run faster.
 
 ### Issues in the C++ code
 
 Let's enumerate some issues and ideas that we will address later on.
 
 1. The only parallelism we are using is the **multi-threading**.
-2. It is not clear how the **CPU vector intrinsics** can be applied in the convolution.
+
+   Each row is processed in parallel but can we do better than parallel row
+   processing?
+
+2. It is not clear how the **CPU vector instructions** can be applied in the
+   convolution.
    - For one thing, the boundary checking does not easily allow the compiler to
    vectorize the C++ code.
    - There is actually a better way to exploit the CPU vector instructions for
      the convolutional operation and the C++ code does not do this way.
-3. **Data locality** is very important aspect, which we are not exploiting.
+3. **Data locality** is a very important aspect, which the parallel row
+   processing does not easily allow to leverage.
 
 It certainly wasn't the fault of my younger self who did not know any better.
 Let us now address these issues and ideas in Halide.
@@ -308,7 +315,7 @@ of
 on single-precision floating point data.
 
 While we should instead use Amdahl's law instead of this naive formula, applying
-this formula is not so simple and we will be content to just benchmark and
+Amdahl's law is not so simple and we will be content to just benchmark and
 measure the performance gain.
 
 Likewise, optimized linear algebra routines like OpenBLAS makes extensive use of
@@ -369,6 +376,8 @@ auto yo = Halide::Var{"yo"};
 auto xi = Halide::Var{"xi"};
 auto yi = Halide::Var{"yi"};
 
+auto tile_index = Halide::Var{"t"};
+
 conv_y.tile(x, y, xo, yo, xi, yi, 64, 64,
             Halide::TailStrategy::GuardWithIf)
     .fuse(xo, yo, tile_index)
@@ -433,7 +442,8 @@ There is a lot to unpack here. Let's try to break it down bit by bit the
 schedule below:
 
 ```{cpp}
-  conv_y.tile(x, y, xo, yo, xi, yi, 64, 64, Halide::TailStrategy::GuardWithIf)
+  conv_y.tile(x, y, xo, yo, xi, yi, 64, 64,
+              Halide::TailStrategy::GuardWithIf)
       .fuse(xo, yo, tile_index)
       .parallel(tile_index);
       .vectorize(xi, 16, Halide::TailStrategy::GuardWithIf)
@@ -445,8 +455,9 @@ According to how I understand it:
 #pragma omp parallel for
 for (auto tile_index = 0; tile_index < T; ++T)
 {
-  // Process the tile (xo, yo). const auto yo = tile_index / T; const auto xo =
-  tile_index % T;
+  // Process the tile (xo, yo).
+  const auto yo = tile_index / T;
+  const auto xo = tile_index % T;
 
   for (auto yi = 0; yi < 64; ++yi)
   {
@@ -531,11 +542,12 @@ parallelisms in the schedule:
      // vectorize(xi, 32, GuardWithIf);
    ```
 
-This will help us to map out mentally the algorithmic roadmap and understand how
-the HTML visually maps out the **three-way** correspondence between:
-1. each part of the pseudo-code
+This will help us to break down the algorithmic roadmap and understand how the
+HTML visually maps out the **three-way** correspondence between:
+
+1. each part of the pseudo-code,
 2. each diagram block and
-3. each of the assembly code.
+3. each part of the assembly code.
 
 After a bit of inspection, by reinstating progressively all the different
 parallelisms, we start to understand how the parallel patterns are implemented

From c64fe4c65ce68970ea2bfc138f74b71c4c646118 Mon Sep 17 00:00:00 2001
From: Odd Kiva <2375733-oddkiva@users.noreply.gitlab.com>
Date: Wed, 3 Apr 2024 19:20:02 +0100
Subject: [PATCH 12/49] DOC: reformat code.

---
 doc/book/random/vector_intrinsics.Rmd | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/doc/book/random/vector_intrinsics.Rmd b/doc/book/random/vector_intrinsics.Rmd
index c24e2df62..2ff98df46 100644
--- a/doc/book/random/vector_intrinsics.Rmd
+++ b/doc/book/random/vector_intrinsics.Rmd
@@ -424,15 +424,15 @@ auto tile = Halide::Var{"tile"};
 kernel.compute_root();
 
 // The schedule
-conv_y  //
+conv_y
     .tile(x, y, xo, yo, xi, yi, 64, 64)
     .fuse(xo, yo, tile)
     .parallel(tile)
-    .vectorize(xi, 32, Halide::TailStrategy::GuardWithIf)  //
+    .vectorize(xi, 32, Halide::TailStrategy::GuardWithIf)
     ;
 conv_x
-    .compute_at(conv_y, xi)                               //
-    .vectorize(x, 32, Halide::TailStrategy::GuardWithIf)  //
+    .compute_at(conv_y, xi)
+    .vectorize(x, 32, Halide::TailStrategy::GuardWithIf)
     ;
 ```
 

From bb4c915a5f1fd06fa0372c5d7b9fdd6590195d21 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 4 Apr 2024 01:39:38 +0000
Subject: [PATCH 13/49] Bump vite from 4.5.2 to 4.5.3 in /svelte/sara-app

Bumps [vite](https://github.com/vitejs/vite/tree/HEAD/packages/vite) from 4.5.2 to 4.5.3.
- [Release notes](https://github.com/vitejs/vite/releases)
- [Changelog](https://github.com/vitejs/vite/blob/v4.5.3/packages/vite/CHANGELOG.md)
- [Commits](https://github.com/vitejs/vite/commits/v4.5.3/packages/vite)

---
updated-dependencies:
- dependency-name: vite
  dependency-type: direct:development
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 svelte/sara-app/package.json   |  2 +-
 svelte/sara-app/pnpm-lock.yaml | 36 +++++++++++++++++-----------------
 2 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/svelte/sara-app/package.json b/svelte/sara-app/package.json
index 4c2fdbaf8..792f8e3d0 100644
--- a/svelte/sara-app/package.json
+++ b/svelte/sara-app/package.json
@@ -20,7 +20,7 @@
 		"svelte-check": "^3.0.1",
 		"tslib": "^2.4.1",
 		"typescript": "^5.0.0",
-		"vite": "^4.5.2",
+		"vite": "^4.5.3",
     "autoprefixer": "^10.4.14",
     "daisyui": "^3.0.2",
     "postcss": "^8.4.31",
diff --git a/svelte/sara-app/pnpm-lock.yaml b/svelte/sara-app/pnpm-lock.yaml
index 5885f4d62..f45e45005 100644
--- a/svelte/sara-app/pnpm-lock.yaml
+++ b/svelte/sara-app/pnpm-lock.yaml
@@ -10,7 +10,7 @@ devDependencies:
     version: 2.1.0(@sveltejs/kit@1.20.2)
   '@sveltejs/kit':
     specifier: ^1.5.0
-    version: 1.20.2(svelte@3.59.1)(vite@4.5.2)
+    version: 1.20.2(svelte@3.59.1)(vite@4.5.3)
   autoprefixer:
     specifier: ^10.4.14
     version: 10.4.14(postcss@8.4.31)
@@ -42,8 +42,8 @@ devDependencies:
     specifier: ^5.0.0
     version: 5.1.3
   vite:
-    specifier: ^4.5.2
-    version: 4.5.2
+    specifier: ^4.5.3
+    version: 4.5.3
 
 packages:
 
@@ -314,11 +314,11 @@ packages:
     peerDependencies:
       '@sveltejs/kit': ^1.0.0
     dependencies:
-      '@sveltejs/kit': 1.20.2(svelte@3.59.1)(vite@4.5.2)
+      '@sveltejs/kit': 1.20.2(svelte@3.59.1)(vite@4.5.3)
       import-meta-resolve: 3.0.0
     dev: true
 
-  /@sveltejs/kit@1.20.2(svelte@3.59.1)(vite@4.5.2):
+  /@sveltejs/kit@1.20.2(svelte@3.59.1)(vite@4.5.3):
     resolution: {integrity: sha512-MtR1i+HtmYWcRgtubw1GQqT/+CWXL/z24PegE0xYAdObbhdr7YtEfmoe705D/JZMtMmoPXrmSk4W0MfL5A3lYw==}
     engines: {node: ^16.14 || >=18}
     hasBin: true
@@ -327,7 +327,7 @@ packages:
       svelte: ^3.54.0 || ^4.0.0-next.0
       vite: ^4.0.0
     dependencies:
-      '@sveltejs/vite-plugin-svelte': 2.4.1(svelte@3.59.1)(vite@4.5.2)
+      '@sveltejs/vite-plugin-svelte': 2.4.1(svelte@3.59.1)(vite@4.5.3)
       '@types/cookie': 0.5.1
       cookie: 0.5.0
       devalue: 4.3.2
@@ -341,12 +341,12 @@ packages:
       svelte: 3.59.1
       tiny-glob: 0.2.9
       undici: 5.22.1
-      vite: 4.5.2
+      vite: 4.5.3
     transitivePeerDependencies:
       - supports-color
     dev: true
 
-  /@sveltejs/vite-plugin-svelte-inspector@1.0.2(@sveltejs/vite-plugin-svelte@2.4.1)(svelte@3.59.1)(vite@4.5.2):
+  /@sveltejs/vite-plugin-svelte-inspector@1.0.2(@sveltejs/vite-plugin-svelte@2.4.1)(svelte@3.59.1)(vite@4.5.3):
     resolution: {integrity: sha512-Cy1dUMcYCnDVV/hPLXa43YZJ2jGKVW5rA0xuNL9dlmYhT0yoS1g7+FOFSRlgk0BXKk/Oc7grs+8BVA5Iz2fr8A==}
     engines: {node: ^14.18.0 || >= 16}
     peerDependencies:
@@ -354,30 +354,30 @@ packages:
       svelte: ^3.54.0 || ^4.0.0-next.0
       vite: ^4.0.0
     dependencies:
-      '@sveltejs/vite-plugin-svelte': 2.4.1(svelte@3.59.1)(vite@4.5.2)
+      '@sveltejs/vite-plugin-svelte': 2.4.1(svelte@3.59.1)(vite@4.5.3)
       debug: 4.3.4
       svelte: 3.59.1
-      vite: 4.5.2
+      vite: 4.5.3
     transitivePeerDependencies:
       - supports-color
     dev: true
 
-  /@sveltejs/vite-plugin-svelte@2.4.1(svelte@3.59.1)(vite@4.5.2):
+  /@sveltejs/vite-plugin-svelte@2.4.1(svelte@3.59.1)(vite@4.5.3):
     resolution: {integrity: sha512-bNNKvoRY89ptY7udeBSCmTdCVwkjmMcZ0j/z9J5MuedT8jPjq0zrknAo/jF1sToAza4NVaAgR9AkZoD9oJJmnA==}
     engines: {node: ^14.18.0 || >= 16}
     peerDependencies:
       svelte: ^3.54.0 || ^4.0.0-next.0
       vite: ^4.0.0
     dependencies:
-      '@sveltejs/vite-plugin-svelte-inspector': 1.0.2(@sveltejs/vite-plugin-svelte@2.4.1)(svelte@3.59.1)(vite@4.5.2)
+      '@sveltejs/vite-plugin-svelte-inspector': 1.0.2(@sveltejs/vite-plugin-svelte@2.4.1)(svelte@3.59.1)(vite@4.5.3)
       debug: 4.3.4
       deepmerge: 4.3.1
       kleur: 4.1.5
       magic-string: 0.30.0
       svelte: 3.59.1
       svelte-hmr: 0.15.2(svelte@3.59.1)
-      vite: 4.5.2
-      vitefu: 0.2.4(vite@4.5.2)
+      vite: 4.5.3
+      vitefu: 0.2.4(vite@4.5.3)
     transitivePeerDependencies:
       - supports-color
     dev: true
@@ -1353,8 +1353,8 @@ packages:
     resolution: {integrity: sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==}
     dev: true
 
-  /vite@4.5.2:
-    resolution: {integrity: sha512-tBCZBNSBbHQkaGyhGCDUGqeo2ph8Fstyp6FMSvTtsXeZSPpSMGlviAOav2hxVTqFcx8Hj/twtWKsMJXNY0xI8w==}
+  /vite@4.5.3:
+    resolution: {integrity: sha512-kQL23kMeX92v3ph7IauVkXkikdDRsYMGTVl5KY2E9OY4ONLvkHf04MDTbnfo6NKxZiDLWzVpP5oTa8hQD8U3dg==}
     engines: {node: ^14.18.0 || >=16.0.0}
     hasBin: true
     peerDependencies:
@@ -1388,7 +1388,7 @@ packages:
       fsevents: 2.3.2
     dev: true
 
-  /vitefu@0.2.4(vite@4.5.2):
+  /vitefu@0.2.4(vite@4.5.3):
     resolution: {integrity: sha512-fanAXjSaf9xXtOOeno8wZXIhgia+CZury481LsDaV++lSvcU2R9Ch2bPh3PYFyoHW+w9LqAeYRISVQjUIew14g==}
     peerDependencies:
       vite: ^3.0.0 || ^4.0.0
@@ -1396,7 +1396,7 @@ packages:
       vite:
         optional: true
     dependencies:
-      vite: 4.5.2
+      vite: 4.5.3
     dev: true
 
   /wrappy@1.0.2:

From cbe81d6e6c8a4faf1cd807d62bb46779983fe754 Mon Sep 17 00:00:00 2001
From: David OK <david.ok8@gmail.com>
Date: Thu, 4 Apr 2024 18:39:01 +0100
Subject: [PATCH 14/49] DOC: reword.

---
 doc/book/random/vector_intrinsics.Rmd | 86 +++++++++++++++++----------
 1 file changed, 53 insertions(+), 33 deletions(-)

diff --git a/doc/book/random/vector_intrinsics.Rmd b/doc/book/random/vector_intrinsics.Rmd
index 2ff98df46..f915a08d9 100644
--- a/doc/book/random/vector_intrinsics.Rmd
+++ b/doc/book/random/vector_intrinsics.Rmd
@@ -1,9 +1,15 @@
 # Super Fast Separable Convolutions
 
-Sounds boring? I promise you this is going to be way more interesting than you
-might think. While this is a common operation in low-level operation, there is
-quite a few interesting things to learn how to leverage the hardware to
-accelerate computing.
+Sounds boring?
+
+I promise you this is going to be way more interesting than you might think.
+While this is a common low-level operation, there are quite a few interesting
+things to learn about how to use the hardware to accelerate computing.
+This topic can be read nonlinearly and you can directly move to the sections of
+interest.
+
+In a nutshell, I can give you some tools to reimplement *algorithms that run
+much faster than OpenCV*.
 
 Story time before we dive into the implementation of the Gaussian blur.
 
@@ -13,10 +19,10 @@ Once, I applied for a C++ technical leadership role for some company. I did a
 behavioral interview with their CTO. He then told me who I would be interviewing
 with in the next interviews. A few days later, he called me back for a quick
 chat. He ended up telling me they would not move forward after examining
-*Sara*'s image processing code. Without really explaining why, the most likely
-reason I could come up with was that one of their senior engineers disqualified
-me because all he cared about was whether I could understand and write code with
-CPU vector instructions.
+*Sara*'s image processing code. He did not really explain why. Nevertheless, the
+most likely explanation I could come up with was that one of their senior
+engineers disqualified me because all he seemed to care about was whether I
+could understand and write code with CPU vector instructions.
 
 The CTO profusedly apologized to me. He said politely that I certainly was
 gifted but my code was not up to their standard. They must have deemed that my
@@ -28,19 +34,20 @@ process data as fast as possible. That it really was not that difficult and that
 I could get up to speed fairly quickly, and blah blah blah... That was the only
 reason for which, I believe, they disqualified me.
 
-Having mostly an applied math background, it did sound unfair and hypocritical
-to me. Hypocritical because, if it really was easy, then why can you not learn
-it on the job?
-So, yes, it did make me feel that you are never enough whatever you achieve in
-life.
+Having mostly an applied math background and being a self-taught programmer, it
+did sound unfair and hypocritical to me. Hypocritical because, if it really was
+easy, then why can you not learn it on the job? So, yes, it did make me feel
+that you are never enough whatever you achieve in life.
 Oh, so you're supposed to master every single damn thing of software engineering
 when you start a job?
 Going by the same logic, if David Lowe showcased his SIFT research code, he
 would not qualify for the job either since I learnt from studying his code.
 
-In hindsight I would not have been happy with the job anyways. End of the rant
-and back to the topic: today how can we achieve that? Nowadays, we have some
-answers with Halide, which allows us to do it very elegantly.
+In hindsight I would not have been happy with the job anyways.
+
+End of my angry rant and back to the topic: today how can we achieve that?
+Nowadays, we have some answers with Halide, which allows us to do it very
+elegantly.
 
 ## What is Halide?
 
@@ -55,7 +62,7 @@ of code*. Then Halide will compile the C++ code into usable assembly code.
 The main beauty with Halide is that you decouple:
 
 1. the **algorithm**: in our case, the **separable convolution** and,
-2. the **schedule**: which exploits the different **kinds of parallelism** that
+2. the **schedule**: which leverages the different **kinds of parallelism** that
    the hardware offers,
 
 so that our implementation is as fast as the baseline implementation. In our
@@ -82,17 +89,20 @@ average programmer to identify which schedules work best and those that don't.
 Naysayers will argue: "But Halide is too high level and does too much magic!"
 
 Certainly, optimizing algorithms by writing explicitly CPU vector instructions
-can be done. But we would have to pay a very costly engineering price.
+can be done. But we would have to pay a very costly engineering price. If I were
+the head of a small business, it would be hard to justify the heavy engineering
+time and money that it entails.
 
-You would have to optimize for different CPU platforms: *x86-64*, *ARM*,
-*RISC-V* and learn their respective C API to utilize hardware SIMD instructions.
-The resulting code would be much lengthier and harder to maintain than using a
-unified language that allows you to write these in a very few lines of codes.
+Nowadays, would you be ready to spend time optimizing for different CPU
+platforms: *x86-64*, *ARM*, *RISC-V* and learn their respective C API to invoke
+SIMD instructions? On top of that, the resulting code would be much lengthier
+and harder to maintain than using a unified language that allows you to write
+these in very few lines of code.
 
 Unless this is your full-time job or it's something you really want to learn,
-personally I don't want to spend time into this. Halide has done an excellent
-job in abstracting this at the least on the CPU side. So know who you are and
-decide what you want to do.
+personally I don't want to spend time on this. Either way... Halide has done
+an excellent job in making CPU vector instructions *platform-agnostic* and it
+will help you learn the C API faster.
 
 Let's conclude this paragraph with a few words regarding the GPU acceleration
 with Halide. On the GPU side, Halide is indeed not yet mature. For one thing,
@@ -178,7 +188,7 @@ auto conv_y(const float *in, const float *kernel, float *out,
       for (auto k = 0; k < ksz; ++k)
       {
         // Check the boundary conditions.
-        const auto yk = std::clamp(y - r + k, 0, w - 1);
+        const auto yk = std::clamp(y - r + k, 0, h - 1);
 
         // Accumulate.
         const auto xy = yk * w + x;
@@ -195,8 +205,8 @@ I diligently write unit tests, validate on synthetic tests, check the boundary
 conditions and try it on a real image with a Gaussian kernel. I am happy it
 works reasonably fast when compiling in Release mode on Visual Studio. Job done!
 I am proudly showcasing my code on GitHub. Never complained about it as I never
-bothered about real-time considerations. Instead I would port the code to make
-the algorithm run faster.
+bothered about real-time considerations. Instead I would port the code in CUDA
+to make the algorithm run faster.
 
 ### Issues in the C++ code
 
@@ -209,15 +219,25 @@ Let's enumerate some issues and ideas that we will address later on.
 
 2. It is not clear how the **CPU vector instructions** can be applied in the
    convolution.
-   - For one thing, the boundary checking does not easily allow the compiler to
-   vectorize the C++ code.
+   - For one thing, the boundary check via clamping does not easily allow the
+     compiler to vectorize the C++ code.
    - There is actually a better way to exploit the CPU vector instructions for
      the convolutional operation and the C++ code does not do this way.
+
 3. **Data locality** is a very important aspect, which the parallel row
-   processing does not easily allow to leverage.
+   processing may not easily allow us to exploit.
+
+   Just like in CUDA programming, processing the image into tiles on CPU is much
+   more efficient in practice.
+
+   Let's note that processing tiles in parallel is a pattern that keeps
+   repeating in optimized linear algebra routines (OpenBLAS etc.), in particular
+   the matrix-matrix multiplication *gemm* operation, which is one fundamental
+   operation in deep learning.
 
-It certainly wasn't the fault of my younger self who did not know any better.
-Let us now address these issues and ideas in Halide.
+There are probably more issues. Hopefully I pointed out the most problematic
+ones. It certainly wasn't the fault of my younger self who did not know any
+better. Let us now address these issues and ideas in Halide.
 
 
 ## A Very Fast Implementation in Halide

From f07a2d5c38c5442ce69201183f280c5bc458addb Mon Sep 17 00:00:00 2001
From: Odd Kiva <2375733-oddkiva@users.noreply.gitlab.com>
Date: Fri, 5 Apr 2024 22:50:14 +0100
Subject: [PATCH 15/49] DOC: save work.

---
 doc/book/random/vector_intrinsics.Rmd | 36 +++++++++++++--------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/doc/book/random/vector_intrinsics.Rmd b/doc/book/random/vector_intrinsics.Rmd
index 2ff98df46..0d24b3edf 100644
--- a/doc/book/random/vector_intrinsics.Rmd
+++ b/doc/book/random/vector_intrinsics.Rmd
@@ -274,7 +274,7 @@ conv_y(x, y) = Halide::sum(conv_x(x, y + k - kr / 2) * kernel(k));
 
 There are two types of parallelisms on the CPU which we can exploit altogether.
 
-1. Multicore processing:
+1. *Multicore processing*:
 
    This is straightforward to understand. A CPU can be thought as a factory of
    workers, each one of them being called a CPU core.
@@ -286,25 +286,26 @@ There are two types of parallelisms on the CPU which we can exploit altogether.
    OpenMP is one implementation of multicore processing among others to
    parallelise our image filter.
 
-2. Vector instructions:
+2. *Vector instructions*:
 
-   Until I implemented filters with Halide, I really did not understand what CPU
-   vector instrutions were really about.
+   Until I implemented filters with Halide, I did not understand what CPU vector
+   instructions were really about.
 
    Mainly, a CPU vector instruction enables a CPU core to perform arithmetic
    operations on small vectors in a **single** CPU cycle.
    That means that additions, subtractions, dot products on 4D float vectors can
    be executed in a single CPU cycle instead of 4 CPU cycles (7 CPU cycles for
-   the dot product).
+   the dot product). **(PLEASE RESEARCH THIS MORE: this is not true although the
+   idea is there...)**
 
-   That is very significant on a very large scale as we can observe additional
-   4x speed up or more on very very large arrays and therefore image data.
+   The performance gain is very significant on a large scale as we can observe
+   an additional 4x speed-up or more.
 
 Nowadays, an Intel CPU that supports AVX-512 vector instructions can perform
 operations on 16D vectors of 32-bit floating point data in a single CPU
 instruction.
 
-So when we combine parallelism 1 and 2 on an 12-core Intel CPU with AVX-512
+By combining parallelisms 1 and 2 on a 12-core Intel CPU with AVX-512
 instructions, an optimized algorithm could in principle be sped up by a factor
 of
 
@@ -314,9 +315,9 @@ of
 
 on single-precision floating point data.
 
-While we should instead use Amdahl's law instead of this naive formula, applying
-Amdahl's law is not so simple and we will be content to just benchmark and
-measure the performance gain.
+While we should use *Amdahl's law* instead of this naive formula,
+applying Amdahl's law is not so simple and we will be content to just benchmark
+and measure the performance gain.
 
 Likewise, optimized linear algebra routines like OpenBLAS makes extensive use of
 the two types parallelism on CPU platforms.
@@ -518,10 +519,9 @@ conv_y.compile_to_stmt(
 ```
 
 Halide generates a nicely illustrated HTML document. Still The generated code is
-overwhelming as there is a lot to unpack. How are we going to learn how to read
-a bit of assembly code.
+overwhelming as there is a lot to unpack. How are we going to learn how to read a bit of assembly code?
 
-To digest the algorithmic flow, start by commenting out the different
+To digest the algorithmic flow, I started by commenting out the different
 parallelisms in the schedule:
 
 1. the multicore parallelism
@@ -558,12 +558,12 @@ This is how it looks like.
 style="border: 1px solid #464646;" allowfullscreen>
 </iframe>
 
-It is not practical to view the embedded HTML page, so go to this <a
-href="./random/separable_conv_2d.stmt.html">link</a> to fully explore the
-compiled statement as a full page:
+It is not practical to view the embedded HTML page, so you can click on this
+<a href="./random/separable_conv_2d.stmt.html">link</a> to fully explore
+the compiled statement as a full page:
 
 
-### Vectorizing in the Convolution
+### Vectorizing the Convolution
 
 Upon inspection of the compiled statement, it turns out that the convolution
 operation is implemented by batch where we calculate 4, 8 or 16 convolved values

From 4b9e1cfbc43a1cf3f340619792d9ee51b3797adb Mon Sep 17 00:00:00 2001
From: Odd Kiva <david.ok8@gmail.com>
Date: Sun, 7 Apr 2024 15:50:55 +0100
Subject: [PATCH 16/49] MAINT: update to Halide 17.0.1

---
 build.ps1 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build.ps1 b/build.ps1
index 04b14386f..4c5632bd1 100644
--- a/build.ps1
+++ b/build.ps1
@@ -13,7 +13,7 @@ $cmake_toolset = $cmake_vsver[$vsver]
 
 $qt_dir = "C:\local\qt-everywhere-src-6.1.2\qtbase"
 $cudnn_dir = "C:\local\C:\local\cudnn-windows-x86_64-8.8.0.121_cuda12-archive"
-$halide_dir = "C:\local\Halide-16.0.0-x86-64-windows"
+$halide_dir = "C:\local\Halide-17.0.1-x86-64-windows"
 $nvidia_codec_sdk_dir = "C:\local\Video_Codec_SDK_12.1.14"
 $tensorrt_dir = "C:\local\TensorRT-8.6.0.12.Windows10.x86_64.cuda-12.0"
 

From 5a466ca8e49687d4231c0ca7bfb9ed8d1e4aede6 Mon Sep 17 00:00:00 2001
From: Odd Kiva <david.ok8@gmail.com>
Date: Sun, 7 Apr 2024 15:51:11 +0100
Subject: [PATCH 17/49] WIP: fix compile errors with MSVC.

---
 cpp/src/DO/Sara/Graphics/ImageDraw.cpp        |  2 +-
 .../DO/Sara/NeuralNetworks/Darknet/Debug.hpp  | 28 +++++++++++--------
 .../DO/Shakti/Cuda/TensorRT/DarknetParser.cpp |  4 +++
 ...st_neuralnetworks_tensorrt_onnx_parser.cpp |  2 ++
 4 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/cpp/src/DO/Sara/Graphics/ImageDraw.cpp b/cpp/src/DO/Sara/Graphics/ImageDraw.cpp
index 0a0d6db80..85843493c 100644
--- a/cpp/src/DO/Sara/Graphics/ImageDraw.cpp
+++ b/cpp/src/DO/Sara/Graphics/ImageDraw.cpp
@@ -186,7 +186,7 @@ namespace DO { namespace Sara {
 
     const auto fm = QFontMetrics{font};
     const auto qstr = QString::QString::fromLocal8Bit(text.c_str());
-    const auto qstr_bbox = fm.boundingRect(qstr);
+    const auto qstr_bbox = QRectF{fm.boundingRect(qstr)};
 
     return {
         qstr_bbox.x(),
diff --git a/cpp/src/DO/Sara/NeuralNetworks/Darknet/Debug.hpp b/cpp/src/DO/Sara/NeuralNetworks/Darknet/Debug.hpp
index 286f20cfa..17027f4cf 100644
--- a/cpp/src/DO/Sara/NeuralNetworks/Darknet/Debug.hpp
+++ b/cpp/src/DO/Sara/NeuralNetworks/Darknet/Debug.hpp
@@ -24,11 +24,12 @@ namespace DO::Sara::Darknet {
   // CAVEAT: this is sensitive to the CPU architecture endianness.
   template <typename T, int N>
   inline auto write_tensor(const TensorView_<T, N>& x,
-                           const std::string& filepath) -> void
+                           const std::filesystem::path& filepath) -> void
   {
-    auto file = std::ofstream{filepath, std::ios::binary};
+    auto file = std::ofstream{filepath.string(), std::ios::binary};
     if (!file.is_open())
-      throw std::runtime_error{"Error: could not open file: " + filepath + "!"};
+      throw std::runtime_error{
+          "Error: could not open file: " + filepath.string() + "!"};
 
     // Write the tensor dimension.
     file.write(reinterpret_cast<const char*>(x.sizes().data()),
@@ -39,11 +40,13 @@ namespace DO::Sara::Darknet {
   }
 
   // CAVEAT: this is sensitive to the CPU architecture endianness.
-  inline auto read_tensor(const std::string& filepath) -> Tensor_<float, 4>
+  inline auto read_tensor(const std::filesystem::path& filepath)
+      -> Tensor_<float, 4>
   {
-    auto file = std::ifstream{filepath, std::ios::binary};
+    auto file = std::ifstream{filepath.string(), std::ios::binary};
     if (!file.is_open())
-      throw std::runtime_error{"Error: could not open file: " + filepath + "!"};
+      throw std::runtime_error{
+          "Error: could not open file: " + filepath.string() + "!"};
 
     auto sizes = Eigen::Vector4i{};
     file.read(reinterpret_cast<char*>(sizes.data()), sizeof(int) * 4);
@@ -58,7 +61,8 @@ namespace DO::Sara::Darknet {
   }
 
   // CAVEAT: this is sensitive to the CPU architecture endianness.
-  inline auto read_all_intermediate_outputs(const std::string& dir_path)
+  inline auto
+  read_all_intermediate_outputs(const std::filesystem::path& dir_path)
   {
     namespace fs = std::filesystem;
 
@@ -157,9 +161,9 @@ namespace DO::Sara::Darknet {
     }
   }
 
-  inline auto check_convolutional_weights(const Network& model,
-                                          const std::string& data_dirpath)
-      -> void
+  inline auto
+  check_convolutional_weights(const Network& model,
+                              const std::filesystem::path& data_dirpath) -> void
   {
     const auto stringify = [](int n) {
       std::ostringstream ss;
@@ -175,9 +179,9 @@ namespace DO::Sara::Darknet {
         SARA_DEBUG << "Checking convolution weights " << i << std::endl;
 
         const auto weights_fp =
-            data_dirpath + "/kernel-" + stringify(i - 1) + ".bin";
+            data_dirpath / ("kernel-" + stringify(i - 1) + ".bin");
         const auto biases_fp =
-            data_dirpath + "/bias-" + stringify(i - 1) + ".bin";
+            data_dirpath / ("bias-" + stringify(i - 1) + ".bin");
 
         const auto w = read_tensor(weights_fp).reshape(conv->weights.w.sizes());
         const auto b = read_tensor(biases_fp);
diff --git a/cpp/src/DO/Shakti/Cuda/TensorRT/DarknetParser.cpp b/cpp/src/DO/Shakti/Cuda/TensorRT/DarknetParser.cpp
index a2f378ebb..117405767 100644
--- a/cpp/src/DO/Shakti/Cuda/TensorRT/DarknetParser.cpp
+++ b/cpp/src/DO/Shakti/Cuda/TensorRT/DarknetParser.cpp
@@ -9,6 +9,10 @@
 // you can obtain one at http://mozilla.org/MPL/2.0/.
 // ========================================================================== //
 
+#if (defined(_WIN32) || defined(_WIN32_WCE)) && !defined(NOMINMAX)
+#  define NOMINMAX
+#endif
+
 #include <DO/Shakti/Cuda/TensorRT/DarknetParser.hpp>
 #include <DO/Shakti/Cuda/TensorRT/IO.hpp>
 #include <DO/Shakti/Cuda/TensorRT/Mish.hpp>
diff --git a/cpp/test/Shakti/Cuda/TensorRT/test_neuralnetworks_tensorrt_onnx_parser.cpp b/cpp/test/Shakti/Cuda/TensorRT/test_neuralnetworks_tensorrt_onnx_parser.cpp
index b04251f1d..88ee67a18 100644
--- a/cpp/test/Shakti/Cuda/TensorRT/test_neuralnetworks_tensorrt_onnx_parser.cpp
+++ b/cpp/test/Shakti/Cuda/TensorRT/test_neuralnetworks_tensorrt_onnx_parser.cpp
@@ -84,6 +84,8 @@ auto serialize_onnx_model_as_tensort_engine(
     throw std::runtime_error{"Failed to serialize the ONNX model!"};
   if (plan->size() == 0)
     throw std::runtime_error{"The byte size of the serialized engine is 0!"};
+
+  return plan;
 }
 
 

From 7a99a71ef5d7183fb35d53432e97c2dc53872690 Mon Sep 17 00:00:00 2001
From: Odd Kiva <2375733-oddkiva@users.noreply.gitlab.com>
Date: Sun, 7 Apr 2024 16:55:01 +0100
Subject: [PATCH 18/49] WIP: save work.

---
 .../Sara/MultiViewGeometry/FeatureGraph.hpp   | 24 ++++++++++++++-----
 cpp/src/DO/Sara/SfM/Graph/BundleAdjuster.hpp  |  2 +-
 cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp | 11 +++++----
 3 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/cpp/src/DO/Sara/MultiViewGeometry/FeatureGraph.hpp b/cpp/src/DO/Sara/MultiViewGeometry/FeatureGraph.hpp
index c0c5d8a87..264102d73 100644
--- a/cpp/src/DO/Sara/MultiViewGeometry/FeatureGraph.hpp
+++ b/cpp/src/DO/Sara/MultiViewGeometry/FeatureGraph.hpp
@@ -30,11 +30,17 @@ namespace DO::Sara {
   //! @defgroup FeatureGraph Feature Correspondence Graph
   //! @{
 
-  //! @brief Feature global ID (GID).
+  //! @brief Feature Global ID (GID).
   struct FeatureGID
   {
-    int image_id{-1};
-    int local_id{-1};
+    using image_id_t = int;
+    using local_id_t = int;
+
+    static constexpr auto undefined_image_id = -1;
+    static constexpr auto undefined_local_id = -1;
+
+    image_id_t image_id = undefined_image_id;
+    local_id_t local_id = undefined_local_id;
 
     auto operator==(const FeatureGID& other) const -> bool
     {
@@ -49,11 +55,17 @@ namespace DO::Sara {
   };
 
 
-  //! @brief Match global ID (GID).
+  //! @brief Match Global ID (GID).
   struct MatchGID
   {
-    int ij;
-    int m;
+    using image_pair_t = int;
+    using match_t = int;
+
+    static constexpr auto undefined_image_pair = -1;
+    static constexpr auto undefined_match = -1;
+
+    image_pair_t ij = undefined_image_pair;
+    match_t m = undefined_match;
 
     auto operator==(const MatchGID& other) const -> bool
     {
diff --git a/cpp/src/DO/Sara/SfM/Graph/BundleAdjuster.hpp b/cpp/src/DO/Sara/SfM/Graph/BundleAdjuster.hpp
index 673b9c948..2a0cdba0f 100644
--- a/cpp/src/DO/Sara/SfM/Graph/BundleAdjuster.hpp
+++ b/cpp/src/DO/Sara/SfM/Graph/BundleAdjuster.hpp
@@ -24,7 +24,7 @@ namespace DO::Sara {
   private:
     CameraPoseGraph& _camera_pose_graph;
     const FeatureGraph& _feature_graph;
-    PointCloud& _point_cloud;
+    PointCloudManipulator::PointCloud& _point_cloud;
   };
 
 }  // namespace DO::Sara
diff --git a/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp b/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp
index 9202f4a41..7d90fb4fa 100644
--- a/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp
+++ b/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp
@@ -16,6 +16,7 @@
 #include <DO/Sara/Core/Image.hpp>
 #include <DO/Sara/Core/Pixel.hpp>
 #include <DO/Sara/MultiViewGeometry/Geometry/QuaternionBasedPose.hpp>
+#include <DO/Sara/MultiViewGeometry/Geometry/EssentialMatrix.hpp>
 #include <DO/Sara/SfM/Graph/ImageFeatures.hpp>
 
 #include <boost/graph/adjacency_list.hpp>
@@ -28,13 +29,15 @@ namespace DO::Sara {
 
   struct RelativePoseEdge
   {
-    int src_camera = -1;
-    int dst_camera = -1;
+    using camera_id_t = int;
+    static constexpr auto undefined_camera_id = -1;
+
+    camera_id_t src_camera = undefined_camera_id;
+    camera_id_t dst_camera = undefined_camera_id;
 
     std::vector<std::pair<int, int>> _matches;
     std::vector<std::uint8_t> _inliers;
-    Eigen::Matrix3d R;
-    Eigen::Vector3d t;
+    Motion _motion;
   };
 
   struct CameraPoseGraph

From 32c1b619922d984b36eed4bed8d13b87292772fa Mon Sep 17 00:00:00 2001
From: Odd Kiva <david.ok8@gmail.com>
Date: Sun, 7 Apr 2024 16:42:21 +0100
Subject: [PATCH 19/49] MAINT: fix compile error with MSVC.

---
 cpp/examples/Shakti/TensorRT/CMakeLists.txt   | 14 +++-
 cpp/examples/Shakti/TensorRT/downsample.cu    | 61 +++++++++++++++++
 cpp/examples/Shakti/TensorRT/downsample.hpp   | 23 +++++++
 ...example.cu => tensorrt_yolov4_example.cpp} | 66 +------------------
 4 files changed, 97 insertions(+), 67 deletions(-)
 create mode 100644 cpp/examples/Shakti/TensorRT/downsample.cu
 create mode 100644 cpp/examples/Shakti/TensorRT/downsample.hpp
 rename cpp/examples/Shakti/TensorRT/{tensorrt_yolov4_example.cu => tensorrt_yolov4_example.cpp} (82%)

diff --git a/cpp/examples/Shakti/TensorRT/CMakeLists.txt b/cpp/examples/Shakti/TensorRT/CMakeLists.txt
index 85105247b..582b25ea1 100644
--- a/cpp/examples/Shakti/TensorRT/CMakeLists.txt
+++ b/cpp/examples/Shakti/TensorRT/CMakeLists.txt
@@ -2,9 +2,16 @@ if(NOT CMAKE_CUDA_COMPILER OR NOT TensorRT_FOUND)
   return()
 endif()
 
-file(GLOB TRT_SOURCE_FILES FILES *.cu)
+# This is a workaround with MSVC to avoid fighting with CMake and CUDA.
+add_library(tensorrt_yolov4_utilities downsample.hpp downsample.cu)
+target_link_libraries(tensorrt_yolov4_utilities PRIVATE DO::Shakti::Cuda::TensorRT)
+set_property(TARGET tensorrt_yolov4_utilities
+             PROPERTY FOLDER "Examples/Shakti/NeuralNetworks")
 
-foreach(file ${TRT_SOURCE_FILES})
+
+list(APPEND TRT_EXAMPLE_FILES FILES tensorrt_yolov4_example.cpp)
+
+foreach(file ${TRT_EXAMPLE_FILES})
   get_filename_component(filename ${file} NAME_WE)
 
   add_executable(${filename} ${file})
@@ -24,7 +31,8 @@ foreach(file ${TRT_SOURCE_FILES})
             DO::Shakti::Cuda::Utilities
             DO::Shakti::Cuda::TensorRT
             tinycolormap
-            fmt::fmt)
+            fmt::fmt
+            tensorrt_yolov4_utilities)
 
   set_target_properties(
     ${filename} PROPERTIES COMPILE_FLAGS ${SARA_DEFINITIONS}
diff --git a/cpp/examples/Shakti/TensorRT/downsample.cu b/cpp/examples/Shakti/TensorRT/downsample.cu
new file mode 100644
index 000000000..ec2585b54
--- /dev/null
+++ b/cpp/examples/Shakti/TensorRT/downsample.cu
@@ -0,0 +1,61 @@
+#include "downsample.hpp"
+
+__global__ auto
+naive_downsample_and_transpose_impl(float* out_chw, const std::uint8_t* in_hwc,
+                                    const int wout, const int hout,
+                                    const int win, const int hin) -> void
+{
+  const int c = blockIdx.x * blockDim.x + threadIdx.x;
+  const int yout = blockIdx.y * blockDim.y + threadIdx.y;
+  const int xout = blockIdx.z * blockDim.z + threadIdx.z;
+  // Threads outside the output sizes or past the third channel do nothing.
+  if (xout >= wout || yout >= hout || c >= 3)
+    return;
+  // Ratios between the input sizes and the output sizes.
+  const float sx = float(win) / float(wout);
+  const float sy = float(hin) / float(hout);
+  // Input pixel sampled for this output pixel: scaled coords + 0.5, truncated.
+  int xin = int(xout * sx + 0.5f);
+  int yin = int(yout * sy + 0.5f);
+  // Clamp the sampled coordinates to the last input column and row.
+  if (xin >= win)
+    xin = win - 1;
+  if (yin >= hin)
+    yin = hin - 1;
+  // Flat indices: channel-major (CHW) output, channel-interleaved (HWC) input.
+  const int gi_out = c * hout * wout + yout * wout + xout;
+  const int gi_in = yin * win * 3 + xin * 3 + c;
+  // Convert the 8-bit input value to float and scale it by 1/255.
+  static constexpr auto normalize_factor = 1 / 255.f;
+  out_chw[gi_out] = static_cast<float>(in_hwc[gi_in]) * normalize_factor;
+}
+
+auto naive_downsample_and_transpose(CudaManagedTensor3f& tensor_chw_resized_32f,
+                                    const CudaManagedTensor3ub& tensor_hwc_8u)
+    -> void
+{
+  // Data order: H W C
+  //             0 1 2
+  const auto in_hwc = tensor_hwc_8u.data();
+  const auto win = tensor_hwc_8u.sizes()(1);
+  const auto hin = tensor_hwc_8u.sizes()(0);
+
+  // Data order: C H W
+  //             0 1 2
+  auto out_chw = tensor_chw_resized_32f.data();
+  const auto hout = tensor_chw_resized_32f.sizes()(1);
+  const auto wout = tensor_chw_resized_32f.sizes()(2);
+  // Not static: the grid depends on the output sizes of the current call.
+  const auto threads_per_block = dim3{4, 16, 16};
+  const auto num_blocks = dim3{
+      1,  //
+      (hout + threads_per_block.y - 1) / threads_per_block.y,
+      (wout + threads_per_block.z - 1) / threads_per_block.z  //
+  };
+
+  naive_downsample_and_transpose_impl<<<num_blocks, threads_per_block>>>(
+      out_chw, in_hwc,  //
+      wout, hout,       //
+      win, hin          //
+  );
+}
\ No newline at end of file
diff --git a/cpp/examples/Shakti/TensorRT/downsample.hpp b/cpp/examples/Shakti/TensorRT/downsample.hpp
new file mode 100644
index 000000000..b6d7bab3a
--- /dev/null
+++ b/cpp/examples/Shakti/TensorRT/downsample.hpp
@@ -0,0 +1,23 @@
+#pragma once
+
+#if (defined(_WIN32) || defined(_WIN32_WCE)) && !defined(NOMINMAX)
+#  define NOMINMAX
+#endif
+
+#include <DO/Shakti/Cuda/MultiArray/ManagedMemoryAllocator.hpp>
+#include <DO/Shakti/Cuda/TensorRT/InferenceEngine.hpp>
+
+#include <cstdint>
+
+
+namespace trt = DO::Shakti::TensorRT;
+
+
+using CudaManagedTensor3ub =
+    trt::InferenceEngine::ManagedTensor<std::uint8_t, 3>;
+using CudaManagedTensor3f = trt::InferenceEngine::ManagedTensor<float, 3>;
+
+
+auto naive_downsample_and_transpose(CudaManagedTensor3f& tensor_chw_resized_32f,
+                                    const CudaManagedTensor3ub& tensor_hwc_8u)
+    -> void;
\ No newline at end of file
diff --git a/cpp/examples/Shakti/TensorRT/tensorrt_yolov4_example.cu b/cpp/examples/Shakti/TensorRT/tensorrt_yolov4_example.cpp
similarity index 82%
rename from cpp/examples/Shakti/TensorRT/tensorrt_yolov4_example.cu
rename to cpp/examples/Shakti/TensorRT/tensorrt_yolov4_example.cpp
index 96ea96a46..70f61ccd4 100644
--- a/cpp/examples/Shakti/TensorRT/tensorrt_yolov4_example.cu
+++ b/cpp/examples/Shakti/TensorRT/tensorrt_yolov4_example.cpp
@@ -9,10 +9,10 @@
 // you can obtain one at http://mozilla.org/MPL/2.0/.
 // ========================================================================== //
 
-#include <DO/Shakti/Cuda/MultiArray/ManagedMemoryAllocator.hpp>
+#include "downsample.hpp"
+
 #include <DO/Shakti/Cuda/TensorRT/DarknetParser.hpp>
 #include <DO/Shakti/Cuda/TensorRT/IO.hpp>
-#include <DO/Shakti/Cuda/TensorRT/InferenceEngine.hpp>
 #include <DO/Shakti/Cuda/TensorRT/Yolo.hpp>
 
 #include <DO/Sara/Graphics.hpp>
@@ -41,68 +41,6 @@ using CudaManagedTensor3ub =
 using CudaManagedTensor3f = trt::InferenceEngine::ManagedTensor<float, 3>;
 
 
-__global__ auto naive_downsample_and_transpose(float* out_chw,
-                                               const std::uint8_t* in_hwc,
-                                               const int wout, const int hout,
-                                               const int win, const int hin)
-    -> void
-{
-  const int c = blockIdx.x * blockDim.x + threadIdx.x;
-  const int yout = blockIdx.y * blockDim.y + threadIdx.y;
-  const int xout = blockIdx.z * blockDim.z + threadIdx.z;
-
-  if (xout >= wout || yout >= hout || c >= 3)
-    return;
-
-  const float sx = float(win) / float(wout);
-  const float sy = float(hin) / float(hout);
-
-  int xin = int(xout * sx + 0.5f);
-  int yin = int(yout * sy + 0.5f);
-
-  if (xin >= win)
-    xin = win - 1;
-  if (yin >= hin)
-    yin = hin - 1;
-
-  const int gi_out = c * hout * wout + yout * wout + xout;
-  const int gi_in = yin * win * 3 + xin * 3 + c;
-
-  static constexpr auto normalize_factor = 1 / 255.f;
-  out_chw[gi_out] = static_cast<float>(in_hwc[gi_in]) * normalize_factor;
-}
-
-auto naive_downsample_and_transpose(CudaManagedTensor3f& tensor_chw_resized_32f,
-                                    const CudaManagedTensor3ub& tensor_hwc_8u)
-    -> void
-{
-  // Data order: H W C
-  //             0 1 2
-  const auto in_hwc = tensor_hwc_8u.data();
-  const auto win = tensor_hwc_8u.sizes()(1);
-  const auto hin = tensor_hwc_8u.sizes()(0);
-
-  // Data order: C H W
-  //             0 1 2
-  auto out_chw = tensor_chw_resized_32f.data();
-  const auto hout = tensor_chw_resized_32f.sizes()(1);
-  const auto wout = tensor_chw_resized_32f.sizes()(2);
-
-  static const auto threads_per_block = dim3{4, 16, 16};
-  static const auto num_blocks = dim3{
-      1,  //
-      (hout + threads_per_block.y - 1) / threads_per_block.y,
-      (wout + threads_per_block.z - 1) / threads_per_block.z  //
-  };
-
-  naive_downsample_and_transpose<<<num_blocks, threads_per_block>>>(
-      out_chw, in_hwc,  //
-      wout, hout,       //
-      win, hin          //
-  );
-}
-
-
 class Yolo
 {
 public:

From 18e9f2c6cbd93c2e71a890bee11482c9e8fcc9da Mon Sep 17 00:00:00 2001
From: David OK <david.ok8@gmail.com>
Date: Sun, 7 Apr 2024 18:28:37 +0100
Subject: [PATCH 20/49] MAINT: fix script.

---
 cpp/examples/Shakti/TensorRT/CMakeLists.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/examples/Shakti/TensorRT/CMakeLists.txt b/cpp/examples/Shakti/TensorRT/CMakeLists.txt
index 582b25ea1..ab3f41d93 100644
--- a/cpp/examples/Shakti/TensorRT/CMakeLists.txt
+++ b/cpp/examples/Shakti/TensorRT/CMakeLists.txt
@@ -4,12 +4,12 @@ endif()
 
 # This is a workaround with MSVC to avoid fighting with CMake and CUDA.
 add_library(tensorrt_yolov4_utilities downsample.hpp downsample.cu)
-target_link_libraries(tensorrt_yolov4_utilities PRIVATE DO::Shakti::Cuda::TensorRT)
+target_link_libraries(tensorrt_yolov4_utilities
+                      PRIVATE DO::Shakti::Cuda::TensorRT)
 set_property(TARGET tensorrt_yolov4_utilities
              PROPERTY FOLDER "Examples/Shakti/NeuralNetworks")
 
-
-list(APPEND TRT_EXAMPLE_FILES FILES tensorrt_yolov4_example.cpp)
+list(APPEND TRT_EXAMPLE_FILES tensorrt_yolov4_example.cpp)
 
 foreach(file ${TRT_EXAMPLE_FILES})
   get_filename_component(filename ${file} NAME_WE)

From fb91dd58b203ef7ffe1e460e0802508897f6295f Mon Sep 17 00:00:00 2001
From: Odd Kiva <david.ok8@gmail.com>
Date: Mon, 8 Apr 2024 11:16:59 +0100
Subject: [PATCH 21/49] MAINT: don't force OpenCL usage with NVIDIA hardware.

Not worth fighting...
---
 cpp/src/DO/Shakti/OpenCL/CMakeLists.txt | 4 ++++
 cpp/test/Shakti/OpenCL/CMakeLists.txt   | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/cpp/src/DO/Shakti/OpenCL/CMakeLists.txt b/cpp/src/DO/Shakti/OpenCL/CMakeLists.txt
index 214a04cca..477dcc2d4 100644
--- a/cpp/src/DO/Shakti/OpenCL/CMakeLists.txt
+++ b/cpp/src/DO/Shakti/OpenCL/CMakeLists.txt
@@ -1,3 +1,7 @@
+if (NOT OpenCL_FOUND)
+  return()
+endif ()
+
 file(GLOB DO_Sara_OpenCL_FILES FILES *.hpp)
 set_source_files_properties(${DO_Sara_OpenCL_FILES} PROPERTIES LANGUAGE CXX)
 
diff --git a/cpp/test/Shakti/OpenCL/CMakeLists.txt b/cpp/test/Shakti/OpenCL/CMakeLists.txt
index adab7653f..21d001d70 100644
--- a/cpp/test/Shakti/OpenCL/CMakeLists.txt
+++ b/cpp/test/Shakti/OpenCL/CMakeLists.txt
@@ -1,3 +1,7 @@
+if (NOT OpenCL_FOUND)
+  return()
+endif ()
+
 file(GLOB test_OpenCL_SOURCE_FILES FILES test_*.cpp)
 
 foreach(file IN LISTS test_OpenCL_SOURCE_FILES)

From 7b9c95ea2c46651563bfb675a50aa0d66301a132 Mon Sep 17 00:00:00 2001
From: Odd Kiva <david.ok8@gmail.com>
Date: Mon, 8 Apr 2024 12:21:34 +0100
Subject: [PATCH 22/49] MAINT: fix CMake script.

---
 cpp/examples/Shakti/OpenCL/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cpp/examples/Shakti/OpenCL/CMakeLists.txt b/cpp/examples/Shakti/OpenCL/CMakeLists.txt
index b3f945c69..e73e8efc6 100644
--- a/cpp/examples/Shakti/OpenCL/CMakeLists.txt
+++ b/cpp/examples/Shakti/OpenCL/CMakeLists.txt
@@ -1,2 +1,6 @@
+if (NOT OpenCL_FOUND)
+  return()
+endif ()
+
 add_subdirectory(hello_opencl)
 add_subdirectory(image_processing)

From 0f2143a3ffec4889e6201957c8f8e530dbda140b Mon Sep 17 00:00:00 2001
From: Odd Kiva <david.ok8@gmail.com>
Date: Mon, 8 Apr 2024 12:42:10 +0100
Subject: [PATCH 23/49] MAINT: fix compile warning.

---
 cpp/test/Sara/Core/test_core_tensor.cpp | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/cpp/test/Sara/Core/test_core_tensor.cpp b/cpp/test/Sara/Core/test_core_tensor.cpp
index f37afe00a..ea0bdb58a 100644
--- a/cpp/test/Sara/Core/test_core_tensor.cpp
+++ b/cpp/test/Sara/Core/test_core_tensor.cpp
@@ -113,16 +113,17 @@ BOOST_AUTO_TEST_CASE(test_matrix_case)
   auto img = Image<Matrix<float, M, N>>{W, H};  // Indexed by (y, x, j, i).
 
   auto img_elem = Matrix<float, M, N>{};
-  std::iota(img_elem.data(), img_elem.data() + M * N, 0);
+  for (auto i = 0; i < M * N; ++i)
+    img_elem.data()[i] = static_cast<float>(i);
   img.flat_array().fill(img_elem);
 
   auto m = img.matrix();
-  m(0, 0) *= 0;
-  m(0, 1) *= 1;
-  m(1, 0) *= 2;
-  m(1, 1) *= 3;
-  m(2, 0) *= 4;
-  m(2, 1) *= 5;
+  // clang-format off
+  m(0, 0) *= 0; m(0, 1) *= 1;
+  m(1, 0) *= 2; m(1, 1) *= 3;
+  m(2, 0) *= 4; m(2, 1) *= 5;
+  // clang-format on
+
   /*
    * [[0, 0, 0],  [[0, 2, 4],
    *  [0, 0, 0]]   [1, 3, 5]]

From f07b4b0bf0625c02892ba82e2de009bbfaa515a2 Mon Sep 17 00:00:00 2001
From: Odd Kiva <david.ok8@gmail.com>
Date: Mon, 8 Apr 2024 13:09:10 +0100
Subject: [PATCH 24/49] MAINT: fix compile errors with MSVC.

---
 .../Calibration/OmnidirectionalCameraReprojectionError.hpp  | 6 ++++++
 .../Calibration/PinholeCameraReprojectionError.hpp          | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/cpp/src/DO/Sara/MultiViewGeometry/Calibration/OmnidirectionalCameraReprojectionError.hpp b/cpp/src/DO/Sara/MultiViewGeometry/Calibration/OmnidirectionalCameraReprojectionError.hpp
index 94fc0c5c4..8d1f17789 100644
--- a/cpp/src/DO/Sara/MultiViewGeometry/Calibration/OmnidirectionalCameraReprojectionError.hpp
+++ b/cpp/src/DO/Sara/MultiViewGeometry/Calibration/OmnidirectionalCameraReprojectionError.hpp
@@ -9,6 +9,12 @@
 // you can obtain one at http://mozilla.org/MPL/2.0/.
 // ========================================================================== //
 
+#pragma once
+
+#ifndef GLOG_USE_GLOG_EXPORT
+#  define GLOG_USE_GLOG_EXPORT
+#endif
+
 #include <Eigen/Core>
 
 #include <ceres/ceres.h>
diff --git a/cpp/src/DO/Sara/MultiViewGeometry/Calibration/PinholeCameraReprojectionError.hpp b/cpp/src/DO/Sara/MultiViewGeometry/Calibration/PinholeCameraReprojectionError.hpp
index 666b13c73..fec158066 100644
--- a/cpp/src/DO/Sara/MultiViewGeometry/Calibration/PinholeCameraReprojectionError.hpp
+++ b/cpp/src/DO/Sara/MultiViewGeometry/Calibration/PinholeCameraReprojectionError.hpp
@@ -9,6 +9,12 @@
 // you can obtain one at http://mozilla.org/MPL/2.0/.
 // ========================================================================== //
 
+#pragma once
+
+#ifndef GLOG_USE_GLOG_EXPORT
+#  define GLOG_USE_GLOG_EXPORT
+#endif
+
 #include <Eigen/Core>
 
 #include <ceres/ceres.h>

From cb938e3667fdc7d48005dc8e10b76c94e562e396 Mon Sep 17 00:00:00 2001
From: David OK <david.ok8@gmail.com>
Date: Mon, 8 Apr 2024 14:47:56 +0100
Subject: [PATCH 25/49] MAINT: invoke dark magic for CMake and CUDA.

---
 cpp/src/DO/Sara/UseDOSaraCore.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/DO/Sara/UseDOSaraCore.cmake b/cpp/src/DO/Sara/UseDOSaraCore.cmake
index 5fed5b867..fbdae9708 100644
--- a/cpp/src/DO/Sara/UseDOSaraCore.cmake
+++ b/cpp/src/DO/Sara/UseDOSaraCore.cmake
@@ -19,8 +19,8 @@ if(NOT DO_Sara_Core_ADDED)
   target_compile_options(
     DO_Sara_Core
     PUBLIC
-      $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<PLATFORM_ID:Linux>>:-Xcudafe
-      "--diag_suppress=20236 --diag_suppress=20012">
+      "$<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<PLATFORM_ID:Linux>>:SHELL:-Xcudafe --diag_suppress=20236>"
+      "$<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<PLATFORM_ID:Linux>>:SHELL:-Xcudafe --diag_suppress=20012>"
       $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
   )
 endif()

From f0209b1f9bbc8807fab7da51fdd13095b805a6f0 Mon Sep 17 00:00:00 2001
From: David OK <david.ok8@gmail.com>
Date: Mon, 8 Apr 2024 19:55:50 +0100
Subject: [PATCH 26/49] WIP: save work on visual odometry.

---
 .../visual_odometry_example.cpp               | 58 ++++++++------
 .../SfM/BuildingBlocks/v2/FeatureTracker.hpp  | 39 +++++++++
 .../v2/RelativePoseEstimator.hpp              | 80 +++++++++++++++++++
 cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.cpp | 77 ++++++++++++++++++
 cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp | 57 +++++++++----
 cpp/src/DO/Sara/SfM/Graph/FeatureGID.hpp      |  1 +
 cpp/src/DO/Sara/SfM/Graph/ImageFeatures.hpp   | 40 ----------
 cpp/src/DO/Sara/UseDOSaraSfM.cmake            |  3 +-
 8 files changed, 274 insertions(+), 81 deletions(-)
 create mode 100644 cpp/src/DO/Sara/SfM/BuildingBlocks/v2/FeatureTracker.hpp
 create mode 100644 cpp/src/DO/Sara/SfM/BuildingBlocks/v2/RelativePoseEstimator.hpp
 create mode 100644 cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.cpp
 delete mode 100644 cpp/src/DO/Sara/SfM/Graph/ImageFeatures.hpp

diff --git a/cpp/examples/Sara/MultiViewGeometry/visual_odometry_example.cpp b/cpp/examples/Sara/MultiViewGeometry/visual_odometry_example.cpp
index a6e891510..365e59c04 100644
--- a/cpp/examples/Sara/MultiViewGeometry/visual_odometry_example.cpp
+++ b/cpp/examples/Sara/MultiViewGeometry/visual_odometry_example.cpp
@@ -73,11 +73,18 @@ class SingleWindowApp
     glfwSetWindowSizeCallback(_window, window_size_callback);
   }
 
-  //! @brief Note: RAII does not work on OpenGL applications.
-  //!
-  //! So the destructor gets a default implementation and we neeed to explicitly
-  //! call the terminate method.
-  ~SingleWindowApp() = default;
+  ~SingleWindowApp()
+  {
+    // Destroy GL objects.
+    deinit_gl_resources();
+
+    // Destroy GLFW.
+    if (_window != nullptr)
+      glfwDestroyWindow(_window);
+
+    if (_glfw_initialized)
+      glfwTerminate();
+  }
 
   auto set_config(const fs::path& video_path,
                   const sara::v2::BrownConradyDistortionModel<double>& camera)
@@ -125,17 +132,6 @@ class SingleWindowApp
     }
   }
 
-  auto terminate() -> void
-  {
-    // Destroy GL objects.
-    deinit_gl_resources();
-
-    // Destroy GLFW.
-    if (_window != nullptr)
-      glfwDestroyWindow(_window);
-    glfwTerminate();
-  }
-
 private:
   auto init_opengl() -> void
   {
@@ -264,8 +260,13 @@ class SingleWindowApp
 private:
   static auto init_glfw() -> void
   {
+    if (_glfw_initialized)
+      throw std::runtime_error{
+          "Error: cannot instantiate more than one GLFW application!"};
+
     // Initialize the windows manager.
-    if (!glfwInit())
+    _glfw_initialized = glfwInit();
+    if (!_glfw_initialized)
       throw std::runtime_error{"Error: failed to initialize GLFW!"};
 
     glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
@@ -292,6 +293,8 @@ class SingleWindowApp
   }
 
 private:
+  static bool _glfw_initialized;
+
   GLFWwindow* _window = nullptr;
   //! @brief Framebuffer sizes
   //! We want to use this and not the window sizes because of MacOS retina
@@ -327,6 +330,8 @@ class SingleWindowApp
   float _point_size = 5.f;
 };
 
+bool SingleWindowApp::_glfw_initialized = false;
+
 
 auto main(int const argc, char** const argv) -> int
 {
@@ -340,7 +345,7 @@ auto main(int const argc, char** const argv) -> int
   {
     std::cout << fmt::format("Usage: {} VIDEO_PATH\n",
                              std::string_view{argv[0]});
-    return 1;
+    return EXIT_FAILURE;
   }
 
   const auto video_path = fs::path{argv[1]};
@@ -360,10 +365,17 @@ auto main(int const argc, char** const argv) -> int
     camera.p() << -0.0003137658969742134, 0.00021943576376532096;
   }
 
-  auto app = SingleWindowApp{{800, 600}, "Odometry: " + video_path.string()};
-  app.set_config(video_path, camera);
-  app.run();
-  app.terminate();
+  try
+  {
+    auto app = SingleWindowApp{{800, 600}, "Odometry: " + video_path.string()};
+    app.set_config(video_path, camera);
+    app.run();
+  }
+  catch (std::exception& e)
+  {
+    std::cerr << e.what() << std::endl;
+    return EXIT_FAILURE;
+  }
 
-  return 0;
+  return EXIT_SUCCESS;
 }
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/v2/FeatureTracker.hpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/v2/FeatureTracker.hpp
new file mode 100644
index 000000000..fdac7975e
--- /dev/null
+++ b/cpp/src/DO/Sara/SfM/BuildingBlocks/v2/FeatureTracker.hpp
@@ -0,0 +1,39 @@
+#pragma once
+
+#include <DO/Sara/Core/Image.hpp>
+#include <DO/Sara/Features/Feature.hpp>
+
+#include <DO/Sara/FeatureDetectors/SIFT.hpp>
+#include <DO/Sara/SfM/BuildingBlocks/KeypointMatching.hpp>
+
+
+namespace DO::Sara::v2 {
+
+  struct FeatureTracker
+  {
+    ImagePyramidParams image_pyr_params = ImagePyramidParams(0);
+    float sift_nn_ratio = 0.6f;
+    std::size_t num_matches_max = 1000u;
+
+    auto detect_features(const ImageView<float>& image,
+                         KeypointList<OERegion, float>& keypoints) const -> void
+    {
+      keypoints = compute_sift_keypoints(image, image_pyr_params);
+    }
+
+    auto match_features(const KeypointList<OERegion, float>& src_keys,
+                        const KeypointList<OERegion, float>& dst_keys) const
+        -> std::vector<Match>
+    {
+      if (features(src_keys).empty() || features(dst_keys).empty())
+        return {};
+
+      auto matches = match(src_keys, dst_keys, sift_nn_ratio);
+      if (matches.size() > num_matches_max)
+        matches.resize(num_matches_max);
+
+      return matches;
+    }
+  };
+
+}  // namespace DO::Sara::v2
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/v2/RelativePoseEstimator.hpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/v2/RelativePoseEstimator.hpp
new file mode 100644
index 000000000..ff3829a3a
--- /dev/null
+++ b/cpp/src/DO/Sara/SfM/BuildingBlocks/v2/RelativePoseEstimator.hpp
@@ -0,0 +1,80 @@
+#pragma once
+
+#include <DO/Sara/FeatureDetectors/SIFT.hpp>
+#include <DO/Sara/Features/KeypointList.hpp>
+
+#include <DO/Sara/MultiViewGeometry/Camera/v2/BrownConradyCamera.hpp>
+#include <DO/Sara/MultiViewGeometry/DataTransformations.hpp>
+#include <DO/Sara/MultiViewGeometry/MinimalSolvers/ErrorMeasures.hpp>
+#include <DO/Sara/MultiViewGeometry/MinimalSolvers/InlierPredicates.hpp>
+#include <DO/Sara/MultiViewGeometry/MinimalSolvers/RelativePoseSolver.hpp>
+
+#include <DO/Sara/RANSAC/RANSACv2.hpp>
+
+#include <DO/Sara/SfM/BuildingBlocks/KeypointMatching.hpp>
+
+
+namespace DO::Sara::v2 {
+
+  struct RelativePoseEstimator
+  {
+    int ransac_iterations_max = 1000;
+    double ransac_confidence = 0.999;
+    double err_thres = 4.;
+
+    // Use Stewenius' algorithm instead of Nister's for now. The polynomial
+    // solver must have some convergence problems.
+    const RelativePoseSolver<SteweniusFivePointAlgorithm> _solver;
+    CheiralAndEpipolarConsistency _inlier_predicate;
+
+    Eigen::Matrix3d _K;
+    Eigen::Matrix3d _K_inv;
+
+    RelativePoseEstimator(const v2::BrownConradyDistortionModel<double>& camera)
+    {
+      configure(camera);
+    }
+
+    auto configure(const v2::BrownConradyDistortionModel<double>& camera)
+        -> void
+    {
+      _K = camera.calibration_matrix();
+      _K_inv = _K.inverse();
+
+      _inlier_predicate.distance.K1_inv = _K_inv;
+      _inlier_predicate.distance.K2_inv = _K_inv;
+      _inlier_predicate.err_threshold = err_thres;
+    }
+
+    auto estimate_relative_pose(const KeypointList<OERegion, float>& src_keys,
+                                const KeypointList<OERegion, float>& dst_keys,
+                                std::vector<Match>& matches) const
+        -> std::tuple<TwoViewGeometry, Tensor_<bool, 1>, Tensor_<int, 1>>
+    {
+      print_stage("Estimating the relative pose...");
+      if (matches.empty())
+      {
+        SARA_DEBUG << "Skipping relative pose estimation\n";
+        return {};
+      }
+
+      const auto& f0 = features(src_keys);
+      const auto& f1 = features(dst_keys);
+      const auto u = std::array{
+          homogeneous(extract_centers(f0)).cast<double>(),
+          homogeneous(extract_centers(f1)).cast<double>()  //
+      };
+      // List the matches as a 2D-tensor where each row encodes a match 'm' as a
+      // pair of point indices (i, j).
+      const auto M = to_tensor(matches);
+
+      const auto X = PointCorrespondenceList{M, u[0], u[1]};
+      auto data_normalizer =
+          std::make_optional(Normalizer<TwoViewGeometry>{_K, _K});
+
+      return v2::ransac(X, _solver, _inlier_predicate, ransac_iterations_max,
+                        ransac_confidence, data_normalizer, true);
+    }
+  };
+
+}  // namespace DO::Sara::v2
diff --git a/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.cpp b/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.cpp
new file mode 100644
index 000000000..a77edd418
--- /dev/null
+++ b/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.cpp
@@ -0,0 +1,77 @@
+// ========================================================================== //
+// This file is part of Sara, a basic set of libraries in C++ for computer
+// vision.
+//
+// Copyright (C) 2023-present David Ok <david.ok8@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License v. 2.0. If a copy of the MPL was not distributed with this file,
+// you can obtain one at http://mozilla.org/MPL/2.0/.
+// ========================================================================== //
+
+
+#include <DO/Sara/SfM/Graph/CameraPoseGraph.hpp>
+
+#include <DO/Sara/FeatureDetectors/SIFT.hpp>
+#include <DO/Sara/Features/KeypointList.hpp>
+#include <DO/Sara/Logging/Logger.hpp>
+
+
+using namespace DO::Sara;
+
+auto CameraPoseGraph::detect_keypoints(
+    const v2::FeatureTracker& feature_tracker,
+    const ImageView<float>& image,  //
+    const int frame_index) -> void
+
+{
+  auto& logger = Logger::get();
+
+  SARA_LOGI(logger, "Detecting keypoints for image frame {}", frame_index);
+
+  // Grow the pose graph by creating a new camera vertex.
+  const auto v = boost::add_vertex(_g);
+
+  // Store the camera pose data.
+  auto& camera_pose_data = _g[v];
+  camera_pose_data.frame_index = frame_index;
+  camera_pose_data.keypoints = compute_sift_keypoints(image);
+
+  const auto& f = features(camera_pose_data.keypoints);
+  SARA_LOGI(logger, "Camera vertex: {} keypoints", f.size());
+}
+
+auto CameraPoseGraph::estimate_relative_motion(
+    const v2::FeatureTracker& feature_tracker,                 //
+    const v2::RelativePoseEstimator& relative_pose_estimator,  //
+    const Vertex u, const Vertex v) -> void
+{
+  auto& logger = Logger::get();
+
+  SARA_LOGI(logger, "Match features...");
+  const auto& src_keys = _g[u].keypoints;
+  const auto& dst_keys = _g[v].keypoints;
+  auto matches = feature_tracker.match_features(src_keys, dst_keys);
+  if (matches.empty())
+    return;
+
+  SARA_LOGI(logger, "Estimating relative pose...");
+  auto [geometry, inliers, sample_best] =
+      relative_pose_estimator.estimate_relative_pose(src_keys, dst_keys,
+                                                     matches);
+  const auto num_inliers = inliers.flat_array().count();
+  SARA_LOGI(logger, "inlier count: {}", num_inliers);
+
+  const auto success = num_inliers > 100;
+  auto e = Edge{};
+  auto edge_added = false;
+  if (success)
+  {
+    std::tie(e, edge_added) = boost::add_edge(u, v, _g);
+    auto& relative_motion_data = _g[e];
+    relative_motion_data.matches = std::move(matches);
+    relative_motion_data.inliers = std::move(inliers);
+    relative_motion_data.src_camera = u;
+    relative_motion_data.dst_camera = v;
+  }
+}
diff --git a/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp b/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp
index 7d90fb4fa..aa7b4f99b 100644
--- a/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp
+++ b/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp
@@ -11,23 +11,33 @@
 
 #pragma once
 
-#include <DO/Sara/Defines.hpp>
-
 #include <DO/Sara/Core/Image.hpp>
-#include <DO/Sara/Core/Pixel.hpp>
-#include <DO/Sara/MultiViewGeometry/Geometry/QuaternionBasedPose.hpp>
+#include <DO/Sara/Features/Feature.hpp>
+#include <DO/Sara/Features/KeypointList.hpp>
 #include <DO/Sara/MultiViewGeometry/Geometry/EssentialMatrix.hpp>
-#include <DO/Sara/SfM/Graph/ImageFeatures.hpp>
+#include <DO/Sara/MultiViewGeometry/Geometry/QuaternionBasedPose.hpp>
 
-#include <boost/graph/adjacency_list.hpp>
+#include <DO/Sara/SfM/BuildingBlocks/v2/FeatureTracker.hpp>
+#include <DO/Sara/SfM/BuildingBlocks/v2/RelativePoseEstimator.hpp>
 
-#include <optional>
-#include <vector>
+#include <boost/graph/adjacency_list.hpp>
 
 
 namespace DO::Sara {
 
-  struct RelativePoseEdge
+  struct CameraPoseData
+  {
+    //! @brief The corresponding image frame index.
+    int frame_index;
+
+    //! @brief The keypoints detected in the image.
+    KeypointList<OERegion, float> keypoints;
+
+    //! @brief "Absolute" pose w.r.t. some reference frame.
+    QuaternionBasedPose<double> pose;
+  };
+
+  struct RelativeMotionData
   {
     using camera_id_t = int;
     static constexpr auto undefined_camera_id = -1;
@@ -35,22 +45,35 @@ namespace DO::Sara {
     camera_id_t src_camera = undefined_camera_id;
     camera_id_t dst_camera = undefined_camera_id;
 
-    std::vector<std::pair<int, int>> _matches;
-    std::vector<std::uint8_t> _inliers;
-    Motion _motion;
+    std::vector<Match> matches;
+    Tensor_<bool, 1> inliers;
+
+    Motion motion;
   };
 
-  struct CameraPoseGraph
+  class CameraPoseGraph
   {
+  public:
     using GraphImpl =
         boost::adjacency_list<boost::vecS, boost::vecS, boost::undirectedS,
-                              QuaternionBasedPose<double>, RelativePoseEdge>;
+                              CameraPoseData, RelativeMotionData>;
+
+  public:
     using Vertex = boost::graph_traits<GraphImpl>::vertex_descriptor;
     using Edge = boost::graph_traits<GraphImpl>::edge_descriptor;
 
-    GraphImpl _pose_graph;
-    std::vector<std::optional<Image<Rgb8>>> _images;
-    ImageKeypoints _image_keypoints;
+    auto detect_keypoints(const v2::FeatureTracker& feature_tracker,
+                          const ImageView<float>& image,  //
+                          const int frame_index) -> void;
+
+    auto estimate_relative_motion(
+        const v2::FeatureTracker& feature_tracker,                 //
+        const v2::RelativePoseEstimator& relative_pose_estimator,  //
+        const Vertex src, const Vertex dst) -> void;
+
+  private:
+    //! @brief The graph data structure shortened as g.
+    GraphImpl _g;
   };
 
 } /* namespace DO::Sara */
diff --git a/cpp/src/DO/Sara/SfM/Graph/FeatureGID.hpp b/cpp/src/DO/Sara/SfM/Graph/FeatureGID.hpp
index 96baa8ca3..afd29e130 100644
--- a/cpp/src/DO/Sara/SfM/Graph/FeatureGID.hpp
+++ b/cpp/src/DO/Sara/SfM/Graph/FeatureGID.hpp
@@ -12,6 +12,7 @@
 #pragma once
 
 #include <DO/Sara/SfM/Graph/CameraPoseGraph.hpp>
+
 #include <utility>
 
 
diff --git a/cpp/src/DO/Sara/SfM/Graph/ImageFeatures.hpp b/cpp/src/DO/Sara/SfM/Graph/ImageFeatures.hpp
deleted file mode 100644
index 83c2ae340..000000000
--- a/cpp/src/DO/Sara/SfM/Graph/ImageFeatures.hpp
+++ /dev/null
@@ -1,40 +0,0 @@
-#pragma once
-
-#include <DO/Sara/Core/Tensor.hpp>
-#include <DO/Sara/Features/Feature.hpp>
-#include <DO/Sara/SfM/Graph/CameraPoseGraph.hpp>
-
-#include <span>
-
-
-namespace DO::Sara {
-
-  struct ImageKeypoints
-  {
-    auto num_images() const -> int
-    {
-      return _num_images;
-    };
-
-    auto descriptor_dimension() const -> int
-    {
-      return _descriptor_dimension;
-    }
-
-    auto features(const std::size_t camera_vertex) const
-        -> std::span<const OERegion>;
-
-    auto descriptors(const std::size_t camera_vertex) const
-        -> TensorView_<const float, 2>;
-
-    //! @brief Feature data.
-    //! @{
-    std::vector<std::vector<OERegion>> _features;
-    std::vector<Eigen::MatrixXf> _descriptors;
-    //! @}
-
-    int _descriptor_dimension = -1;
-    int _num_images = -1;
-  };
-
-}  // namespace DO::Sara
diff --git a/cpp/src/DO/Sara/UseDOSaraSfM.cmake b/cpp/src/DO/Sara/UseDOSaraSfM.cmake
index d85d4798f..53c63eec4 100644
--- a/cpp/src/DO/Sara/UseDOSaraSfM.cmake
+++ b/cpp/src/DO/Sara/UseDOSaraSfM.cmake
@@ -14,7 +14,8 @@ if(SARA_USE_FROM_SOURCE)
     target_link_libraries(
       DO_Sara_SfM
       PRIVATE tinyply Boost::filesystem
-      PUBLIC DO::Sara::Features
+      PUBLIC DO::Sara::Logging
+             DO::Sara::Features
              DO::Sara::FeatureDetectors
              DO::Sara::FeatureDescriptors
              DO::Sara::FeatureMatching

From 0d8555e6ca00c04fc5abe475fb6515f818ebdd01 Mon Sep 17 00:00:00 2001
From: David OK <david.ok8@gmail.com>
Date: Mon, 8 Apr 2024 20:04:54 +0100
Subject: [PATCH 27/49] WIP: save work.

WIP: save work.

MAINT: reorganize files.

MAINT: reorganize files and trim number of functions.

MAINT: fix compile errors.

MAINT: clean up code.

MAINT: clean up code.

MAINT: fix compile errors.

MAINT: refactor code.

WIP: clean up code.

WIP: save work.

WIP: save work.

WIP: save work.
---
 .../Sara/MultiViewGeometry/CMakeLists.txt     |   7 +-
 .../essential_5_point_example.cpp             |  10 +-
 .../fundamental_7_point_example.cpp           |   5 +-
 .../relative_pose_estimation_example.cpp      |  10 +-
 .../two_view_bundle_adjustment_example.cpp    |  13 +-
 .../visual_odometry_example_v2.cpp            | 381 ++++++++++++++++++
 .../SIFT/V2/halide_sift_pyramid_example.cpp   |   2 +-
 .../BundleAdjustmentProblem.hpp               |   2 +-
 .../Sara/MultiViewGeometry/FeatureGraph.cpp   | 311 --------------
 .../{ => Graph}/EpipolarGraph.cpp             |   3 +-
 .../{ => Graph}/EpipolarGraph.hpp             |   0
 .../MultiViewGeometry/Graph/FeatureGraph.cpp  | 318 +++++++++++++++
 .../{ => Graph}/FeatureGraph.hpp              |   2 +-
 cpp/src/DO/Sara/MultiViewGeometry/HDF5.hpp    |   2 +-
 .../BundleAdjuster.hpp                        |   0
 .../EssentialMatrixEstimation.cpp             | 345 ----------------
 .../Sara/SfM/BuildingBlocks/FeatureParams.hpp |  16 +
 .../FundamentalMatrixEstimation.cpp           | 312 --------------
 .../SfM/BuildingBlocks/KeypointDetection.cpp  |  91 -----
 .../SfM/BuildingBlocks/KeypointMatching.cpp   | 109 -----
 .../PointCloudManipulator.hpp                 |   2 +-
 .../BuildingBlocks/RelativePoseEstimator.cpp  |  59 +++
 .../BuildingBlocks/RelativePoseEstimator.hpp  |  41 ++
 .../RgbColoredPoint.hpp                       |   0
 .../SfM/BuildingBlocks/v2/FeatureTracker.hpp  |  39 --
 .../v2/RelativePoseEstimator.hpp              |  80 ----
 cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.cpp |  36 +-
 cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp |  84 ++--
 .../DO/Sara/SfM/Graph/FeatureDisjointSets.hpp |  57 +++
 cpp/src/DO/Sara/SfM/Graph/FeatureGID.hpp      |  41 --
 cpp/src/DO/Sara/SfM/Graph/FeatureGraph.cpp    |   6 +-
 cpp/src/DO/Sara/SfM/Graph/FeatureGraph.hpp    |  73 +++-
 cpp/src/DO/Sara/SfM/Graph/FeatureTracker.cpp  | 189 +++++++++
 cpp/src/DO/Sara/SfM/Graph/FeatureTracker.hpp  |  53 +++
 .../SfM/{BuildingBlocks.hpp => Helpers.hpp}   |   9 +-
 .../SfM/Helpers/EssentialMatrixEstimation.cpp |  55 +++
 .../EssentialMatrixEstimation.hpp             |  16 +-
 .../Helpers/FundamentalMatrixEstimation.cpp   | 122 ++++++
 .../FundamentalMatrixEstimation.hpp           |  14 +-
 .../KeypointMatching.cpp}                     |  26 +-
 .../KeypointMatching.hpp                      |   6 -
 .../Triangulation.cpp                         |   2 +-
 .../Triangulation.hpp                         |   0
 .../DO/Sara/SfM/Odometry/FeatureTracker.hpp   |   2 +-
 .../SfM/Odometry/RelativePoseEstimator.hpp    |   2 +-
 cpp/src/DO/Sara/SfM/Odometry/Triangulator.hpp |   2 +-
 .../DO/Sara/SfM/Odometry/VideoStreamer.hpp    |   5 +
 .../Sara/SfM/OdometryV2/OdometryPipeline.cpp  | 120 ++++++
 .../Sara/SfM/OdometryV2/OdometryPipeline.hpp  |  76 ++++
 .../test_multiviewgeometry_feature_graph.cpp  |  31 +-
 50 files changed, 1697 insertions(+), 1490 deletions(-)
 create mode 100644 cpp/examples/Sara/MultiViewGeometry/visual_odometry_example_v2.cpp
 delete mode 100644 cpp/src/DO/Sara/MultiViewGeometry/FeatureGraph.cpp
 rename cpp/src/DO/Sara/MultiViewGeometry/{ => Graph}/EpipolarGraph.cpp (99%)
 rename cpp/src/DO/Sara/MultiViewGeometry/{ => Graph}/EpipolarGraph.hpp (100%)
 create mode 100644 cpp/src/DO/Sara/MultiViewGeometry/Graph/FeatureGraph.cpp
 rename cpp/src/DO/Sara/MultiViewGeometry/{ => Graph}/FeatureGraph.hpp (98%)
 rename cpp/src/DO/Sara/SfM/{Graph => BuildingBlocks}/BundleAdjuster.hpp (100%)
 delete mode 100644 cpp/src/DO/Sara/SfM/BuildingBlocks/EssentialMatrixEstimation.cpp
 create mode 100644 cpp/src/DO/Sara/SfM/BuildingBlocks/FeatureParams.hpp
 delete mode 100644 cpp/src/DO/Sara/SfM/BuildingBlocks/FundamentalMatrixEstimation.cpp
 delete mode 100644 cpp/src/DO/Sara/SfM/BuildingBlocks/KeypointDetection.cpp
 delete mode 100644 cpp/src/DO/Sara/SfM/BuildingBlocks/KeypointMatching.cpp
 rename cpp/src/DO/Sara/SfM/{Graph => BuildingBlocks}/PointCloudManipulator.hpp (95%)
 create mode 100644 cpp/src/DO/Sara/SfM/BuildingBlocks/RelativePoseEstimator.cpp
 create mode 100644 cpp/src/DO/Sara/SfM/BuildingBlocks/RelativePoseEstimator.hpp
 rename cpp/src/DO/Sara/SfM/{Graph => BuildingBlocks}/RgbColoredPoint.hpp (100%)
 delete mode 100644 cpp/src/DO/Sara/SfM/BuildingBlocks/v2/FeatureTracker.hpp
 delete mode 100644 cpp/src/DO/Sara/SfM/BuildingBlocks/v2/RelativePoseEstimator.hpp
 create mode 100644 cpp/src/DO/Sara/SfM/Graph/FeatureDisjointSets.hpp
 delete mode 100644 cpp/src/DO/Sara/SfM/Graph/FeatureGID.hpp
 create mode 100644 cpp/src/DO/Sara/SfM/Graph/FeatureTracker.cpp
 create mode 100644 cpp/src/DO/Sara/SfM/Graph/FeatureTracker.hpp
 rename cpp/src/DO/Sara/SfM/{BuildingBlocks.hpp => Helpers.hpp} (62%)
 create mode 100644 cpp/src/DO/Sara/SfM/Helpers/EssentialMatrixEstimation.cpp
 rename cpp/src/DO/Sara/SfM/{BuildingBlocks => Helpers}/EssentialMatrixEstimation.hpp (59%)
 create mode 100644 cpp/src/DO/Sara/SfM/Helpers/FundamentalMatrixEstimation.cpp
 rename cpp/src/DO/Sara/SfM/{BuildingBlocks => Helpers}/FundamentalMatrixEstimation.hpp (73%)
 rename cpp/src/DO/Sara/SfM/{BuildingBlocks/KeypointDetection.hpp => Helpers/KeypointMatching.cpp} (58%)
 rename cpp/src/DO/Sara/SfM/{BuildingBlocks => Helpers}/KeypointMatching.hpp (82%)
 rename cpp/src/DO/Sara/SfM/{BuildingBlocks => Helpers}/Triangulation.cpp (99%)
 rename cpp/src/DO/Sara/SfM/{BuildingBlocks => Helpers}/Triangulation.hpp (100%)
 create mode 100644 cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
 create mode 100644 cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.hpp

diff --git a/cpp/examples/Sara/MultiViewGeometry/CMakeLists.txt b/cpp/examples/Sara/MultiViewGeometry/CMakeLists.txt
index 91fb31c21..258c5278a 100644
--- a/cpp/examples/Sara/MultiViewGeometry/CMakeLists.txt
+++ b/cpp/examples/Sara/MultiViewGeometry/CMakeLists.txt
@@ -49,8 +49,11 @@ target_link_libraries(essential_5_point_example PRIVATE tinyply)
 # Visual odometry.
 sara_add_example(visual_odometry_example)
 target_link_libraries(visual_odometry_example PRIVATE DO::Kalpana::EasyGL #
-                                                      fmt::fmt
-                                                      glfw)
+                                                      fmt::fmt glfw)
+
+sara_add_example(visual_odometry_example_v2)
+target_link_libraries(visual_odometry_example_v2 PRIVATE DO::Kalpana::EasyGL #
+                                                         fmt::fmt glfw)
 
 # Bundle adjustment.
 sara_add_example(two_view_bundle_adjustment_example)
diff --git a/cpp/examples/Sara/MultiViewGeometry/essential_5_point_example.cpp b/cpp/examples/Sara/MultiViewGeometry/essential_5_point_example.cpp
index e5dafa087..8094bfed0 100644
--- a/cpp/examples/Sara/MultiViewGeometry/essential_5_point_example.cpp
+++ b/cpp/examples/Sara/MultiViewGeometry/essential_5_point_example.cpp
@@ -16,14 +16,14 @@
 #include <DO/Sara/FeatureDetectors/SIFT.hpp>
 #include <DO/Sara/Graphics.hpp>
 #include <DO/Sara/ImageIO.hpp>
-#include <DO/Sara/MultiViewGeometry/EpipolarGraph.hpp>
+#include <DO/Sara/MultiViewGeometry/Graph/EpipolarGraph.hpp>
 #include <DO/Sara/MultiViewGeometry/Miscellaneous.hpp>
 #include <DO/Sara/RANSAC/RANSAC.hpp>
 
-#include <DO/Sara/SfM/BuildingBlocks/EssentialMatrixEstimation.hpp>
-#include <DO/Sara/SfM/BuildingBlocks/FundamentalMatrixEstimation.hpp>
-#include <DO/Sara/SfM/BuildingBlocks/KeypointMatching.hpp>
-#include <DO/Sara/SfM/BuildingBlocks/Triangulation.hpp>
+#include <DO/Sara/SfM/Helpers/EssentialMatrixEstimation.hpp>
+#include <DO/Sara/SfM/Helpers/FundamentalMatrixEstimation.hpp>
+#include <DO/Sara/SfM/Helpers/KeypointMatching.hpp>
+#include <DO/Sara/SfM/Helpers/Triangulation.hpp>
 
 #include <filesystem>
 
diff --git a/cpp/examples/Sara/MultiViewGeometry/fundamental_7_point_example.cpp b/cpp/examples/Sara/MultiViewGeometry/fundamental_7_point_example.cpp
index 36943ae7f..888fc6f5a 100644
--- a/cpp/examples/Sara/MultiViewGeometry/fundamental_7_point_example.cpp
+++ b/cpp/examples/Sara/MultiViewGeometry/fundamental_7_point_example.cpp
@@ -21,7 +21,7 @@
 #include <DO/Sara/RANSAC/RANSAC.hpp>
 #include <DO/Sara/Visualization.hpp>
 
-#include <DO/Sara/SfM/BuildingBlocks/KeypointMatching.hpp>
+#include <DO/Sara/SfM/Helpers/KeypointMatching.hpp>
 
 
 using namespace std;
@@ -89,7 +89,8 @@ auto estimate_fundamental_matrix(const KeypointList<OERegion, float>& keys1,
   const auto data_normalizer =
       std::make_optional(Normalizer<FundamentalMatrix>{X});
 
-  auto inlier_predicate = InlierPredicate<SymmetricEpipolarGeometricSquaredDistance>{};
+  auto inlier_predicate =
+      InlierPredicate<SymmetricEpipolarGeometricSquaredDistance>{};
   inlier_predicate.err_threshold = f_err_thres;
 
   const auto [F, inliers, sample_best] =
diff --git a/cpp/examples/Sara/MultiViewGeometry/relative_pose_estimation_example.cpp b/cpp/examples/Sara/MultiViewGeometry/relative_pose_estimation_example.cpp
index 163e553f3..3a5da0b7f 100644
--- a/cpp/examples/Sara/MultiViewGeometry/relative_pose_estimation_example.cpp
+++ b/cpp/examples/Sara/MultiViewGeometry/relative_pose_estimation_example.cpp
@@ -23,16 +23,16 @@
 #include <DO/Sara/FeatureDetectors/SIFT.hpp>
 #include <DO/Sara/Graphics.hpp>
 #include <DO/Sara/ImageIO.hpp>
-#include <DO/Sara/MultiViewGeometry/EpipolarGraph.hpp>
+#include <DO/Sara/MultiViewGeometry/Graph/EpipolarGraph.hpp>
 #include <DO/Sara/MultiViewGeometry/MinimalSolvers/InlierPredicates.hpp>
 #include <DO/Sara/MultiViewGeometry/MinimalSolvers/RelativePoseSolver.hpp>
 #include <DO/Sara/MultiViewGeometry/Miscellaneous.hpp>
 #include <DO/Sara/RANSAC/RANSAC.hpp>
 
-#include <DO/Sara/SfM/BuildingBlocks/EssentialMatrixEstimation.hpp>
-#include <DO/Sara/SfM/BuildingBlocks/FundamentalMatrixEstimation.hpp>
-#include <DO/Sara/SfM/BuildingBlocks/KeypointMatching.hpp>
-#include <DO/Sara/SfM/BuildingBlocks/Triangulation.hpp>
+#include <DO/Sara/SfM/Helpers/EssentialMatrixEstimation.hpp>
+#include <DO/Sara/SfM/Helpers/FundamentalMatrixEstimation.hpp>
+#include <DO/Sara/SfM/Helpers/KeypointMatching.hpp>
+#include <DO/Sara/SfM/Helpers/Triangulation.hpp>
 
 
 using namespace std::string_literals;
diff --git a/cpp/examples/Sara/MultiViewGeometry/two_view_bundle_adjustment_example.cpp b/cpp/examples/Sara/MultiViewGeometry/two_view_bundle_adjustment_example.cpp
index a2e7eefec..539b6920e 100644
--- a/cpp/examples/Sara/MultiViewGeometry/two_view_bundle_adjustment_example.cpp
+++ b/cpp/examples/Sara/MultiViewGeometry/two_view_bundle_adjustment_example.cpp
@@ -15,15 +15,14 @@
 #include <DO/Sara/Graphics.hpp>
 #include <DO/Sara/ImageIO.hpp>
 #include <DO/Sara/MultiViewGeometry/BundleAdjustmentProblem.hpp>
-#include <DO/Sara/MultiViewGeometry/EpipolarGraph.hpp>
-#include <DO/Sara/MultiViewGeometry/FeatureGraph.hpp>
+#include <DO/Sara/MultiViewGeometry/Graph/EpipolarGraph.hpp>
+#include <DO/Sara/MultiViewGeometry/Graph/FeatureGraph.hpp>
 #include <DO/Sara/MultiViewGeometry/Miscellaneous.hpp>
 #include <DO/Sara/RANSAC/RANSAC.hpp>
-
-#include <DO/Sara/SfM/BuildingBlocks/EssentialMatrixEstimation.hpp>
-#include <DO/Sara/SfM/BuildingBlocks/FundamentalMatrixEstimation.hpp>
-#include <DO/Sara/SfM/BuildingBlocks/KeypointMatching.hpp>
-#include <DO/Sara/SfM/BuildingBlocks/Triangulation.hpp>
+#include <DO/Sara/SfM/Helpers/EssentialMatrixEstimation.hpp>
+#include <DO/Sara/SfM/Helpers/FundamentalMatrixEstimation.hpp>
+#include <DO/Sara/SfM/Helpers/KeypointMatching.hpp>
+#include <DO/Sara/SfM/Helpers/Triangulation.hpp>
 
 #if defined(_WIN32)
 #  pragma warning(push, 0)
diff --git a/cpp/examples/Sara/MultiViewGeometry/visual_odometry_example_v2.cpp b/cpp/examples/Sara/MultiViewGeometry/visual_odometry_example_v2.cpp
new file mode 100644
index 000000000..1c7f50309
--- /dev/null
+++ b/cpp/examples/Sara/MultiViewGeometry/visual_odometry_example_v2.cpp
@@ -0,0 +1,381 @@
+// ========================================================================== //
+// This file is part of Sara, a basic set of libraries in C++ for computer
+// vision.
+//
+// Copyright (C) 2023 David Ok <david.ok8@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License v. 2.0. If a copy of the MPL was not distributed with this file,
+// you can obtain one at http://mozilla.org/MPL/2.0/.
+// ========================================================================== //
+
+#include <DO/Kalpana/EasyGL.hpp>
+#include <DO/Kalpana/EasyGL/Objects/ColoredPointCloud.hpp>
+#include <DO/Kalpana/EasyGL/Objects/TexturedImage.hpp>
+#include <DO/Kalpana/EasyGL/Objects/TexturedQuad.hpp>
+#include <DO/Kalpana/EasyGL/Renderer/ColoredPointCloudRenderer.hpp>
+#include <DO/Kalpana/EasyGL/Renderer/TextureRenderer.hpp>
+#include <DO/Kalpana/Math/Projection.hpp>
+#include <DO/Kalpana/Math/Viewport.hpp>
+
+#include <DO/Sara/SfM/OdometryV2/OdometryPipeline.hpp>
+
+#if defined(_WIN32)
+#  include <windows.h>
+#endif
+
+#include <GLFW/glfw3.h>
+
+#include <fmt/format.h>
+
+#include <string_view>
+
+#if defined(_OPENMP)
+#  include <omp.h>
+#endif
+
+
+namespace fs = std::filesystem;
+namespace sara = DO::Sara;
+namespace k = DO::Kalpana;
+namespace kgl = DO::Kalpana::GL;
+
+
+class SingleWindowApp
+{
+public:
+  SingleWindowApp(const Eigen::Vector2i& sizes, const std::string& title)
+  {
+    // Init GLFW.
+    init_glfw();
+
+    // Create a window.
+    _window = glfwCreateWindow(sizes.x(), sizes.y(),  //
+                               title.c_str(),         //
+                               nullptr, nullptr);
+
+    _fb_sizes = get_framebuffer_sizes();
+
+    // Initialize the point cloud viewport.
+    _point_cloud_viewport.top_left().setZero();
+    _point_cloud_viewport.sizes() << _fb_sizes.x() / 2, _fb_sizes.y();
+
+    // Initialize the video viewport.
+    _video_viewport.top_left() << _fb_sizes.x() / 2, 0;
+    _video_viewport.sizes() << _fb_sizes.x() / 2, _fb_sizes.y();
+
+    // Prepare OpenGL first before any OpenGL calls.
+    init_opengl();
+
+    // The magic function.
+    glfwSetWindowUserPointer(_window, this);
+    // Register callbacks.
+    glfwSetWindowSizeCallback(_window, window_size_callback);
+  }
+
+  ~SingleWindowApp()
+  {
+    // Destroy GL objects.
+    deinit_gl_resources();
+
+    // Destroy GLFW.
+    if (_window != nullptr)
+      glfwDestroyWindow(_window);
+
+    if (_glfw_initialized)
+      glfwTerminate();
+  }
+
+  auto set_config(const fs::path& video_path,
+                  const sara::v2::BrownConradyDistortionModel<double>& camera)
+      -> void
+  {
+    _pipeline.set_config(video_path, camera);
+    init_gl_resources();
+  }
+
+  auto run() -> void
+  {
+    // Current projection matrix
+    _projection = _video_viewport.orthographic_projection();
+    _point_cloud_projection = _point_cloud_viewport.orthographic_projection();
+
+    // Background color.
+    glClearColor(0.2f, 0.3f, 0.3f, 1.0f);
+
+    glEnable(GL_BLEND);
+    glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
+    glEnable(GL_PROGRAM_POINT_SIZE);
+
+    // You absolutely need this for 3D objects!
+    glEnable(GL_DEPTH_TEST);
+
+    // Display image.
+    glfwSwapInterval(1);
+    while (!glfwWindowShouldClose(_window))
+    {
+      if (!_pipeline.read())
+        break;
+
+      _pipeline.process();
+      // Load data to OpenGL.
+      upload_point_cloud_data_to_opengl();
+
+      // Clear the color buffer and the buffer testing.
+      glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
+
+      render_video();
+      render_point_cloud();
+
+      glfwSwapBuffers(_window);
+      glfwPollEvents();
+    }
+  }
+
+private:
+  auto init_opengl() -> void
+  {
+    glfwMakeContextCurrent(_window);
+    init_glew();
+  }
+
+  auto init_gl_resources() -> void
+  {
+    // Video texture rendering
+    _texture.initialize(_pipeline._video_streamer.frame_rgb8(), 0);
+
+    const auto& w = _pipeline._video_streamer.width();
+    const auto& h = _pipeline._video_streamer.height();
+    const auto aspect_ratio = static_cast<float>(w) / h;
+    auto vertices = _quad.host_vertices().matrix();
+    vertices.col(0) *= aspect_ratio;
+    _quad.initialize();
+
+    _texture_renderer.initialize();
+
+    // Point cloud rendering
+    _point_cloud.initialize();
+    _point_cloud_renderer.initialize();
+  }
+
+  auto deinit_gl_resources() -> void
+  {
+    _texture.destroy();
+    _quad.destroy();
+    _texture_renderer.destroy();
+
+    _point_cloud.destroy();
+    _point_cloud_renderer.destroy();
+  }
+
+  auto upload_point_cloud_data_to_opengl() -> void
+  {
+    // _point_cloud.upload_host_data_to_gl(
+    //     _pipeline._triangulator->_colored_point_cloud);
+  }
+
+  auto render_video() -> void
+  {
+    // Render on the right half of the window surface.
+    glViewport(_video_viewport.x(), _video_viewport.y(),  //
+               _video_viewport.width(), _video_viewport.height());
+    // Transfer the CPU image frame data to the OpenGL texture.
+    // _texture.reset(_pipeline._video_stream.frame_rgb8());
+    _texture.reset(_pipeline.make_display_frame());
+    // Render the texture on the quad.
+    auto model_view = Eigen::Transform<float, 3, Eigen::Projective>{};
+    model_view.setIdentity();
+    _texture_renderer.render(_texture, _quad, model_view.matrix(), _projection);
+  }
+
+  auto render_point_cloud() -> void
+  {
+    glViewport(_point_cloud_viewport.x(), _point_cloud_viewport.y(),
+               _point_cloud_viewport.width(), _point_cloud_viewport.height());
+
+    // CAVEAT: re-express the point cloud in OpenGL axis convention.
+    auto from_cam_to_gl = Eigen::Transform<float, 3, Eigen::Projective>{};
+    from_cam_to_gl.setIdentity();
+    // clang-format off
+    from_cam_to_gl.matrix().topLeftCorner<3, 3>() <<
+      1,  0,  0,
+      0, -1,  0,
+      0,  0, -1;
+    // clang-format on
+
+    // Update the model view matrix.
+    const Eigen::Matrix4f model_view = Eigen::Matrix4f::Identity();
+
+    // Render the point cloud.
+    _point_cloud_renderer.render(_point_cloud, _point_size,
+                                 from_cam_to_gl.matrix(),  //
+                                 model_view, _point_cloud_projection);
+  }
+
+  auto get_framebuffer_sizes() const -> Eigen::Vector2i  // As told by GLFW.
+  {
+    Eigen::Vector2i fb_sizes;
+    glfwGetFramebufferSize(_window, &fb_sizes.x(), &fb_sizes.y());
+    return fb_sizes;
+  }
+
+private:
+  //! @brief Returns the app registered as the user pointer of `window`.
+  //! @throws std::runtime_error if no user pointer has been registered.
+  static auto get_self(GLFWwindow* const window) -> SingleWindowApp&
+  {
+    if (auto* const user_ptr = glfwGetWindowUserPointer(window))
+      return *static_cast<SingleWindowApp*>(user_ptr);
+    throw std::runtime_error{
+        "Please call glfwSetWindowUserPointer to register this window!"};
+  }
+
+  //! @brief GLFW window size callback: lays out the point cloud viewport on
+  //! the left half and the video viewport on the right half of the
+  //! framebuffer, then refreshes both projection matrices.
+  static auto window_size_callback(GLFWwindow* window, const int, const int)
+      -> void
+  {
+    auto& app = get_self(window);
+
+    // Use the framebuffer sizes rather than the window sizes given by GLFW.
+    app._fb_sizes = app.get_framebuffer_sizes();
+    const auto half_width = app._fb_sizes.x() / 2;
+    const auto height = app._fb_sizes.y();
+
+    app._point_cloud_viewport.top_left().setZero();
+    app._point_cloud_viewport.sizes() << half_width, height;
+    app._video_viewport.top_left() << half_width, 0;
+    app._video_viewport.sizes() << half_width, height;
+
+    // Enlarge the orthographic scale when the video is wider than its viewport.
+    const auto video_width = app._pipeline._video_streamer.width();
+    auto scale = 0.5f;
+    if (app._video_viewport.width() < video_width)
+      scale *= static_cast<float>(video_width) / app._video_viewport.width();
+    app._projection = app._video_viewport.orthographic_projection(scale);
+    app._point_cloud_projection = app._point_cloud_viewport.perspective();
+  }
+
+private:
+  //! @brief Initializes GLFW and asks for an OpenGL 3.3 core profile context.
+  //! @throws std::runtime_error if called again or if GLFW fails to start.
+  static auto init_glfw() -> void
+  {
+    if (_glfw_initialized)
+      throw std::runtime_error{
+          "Error: cannot instantiate more than one GLFW application!"};
+    _glfw_initialized = glfwInit() != 0;
+    if (!_glfw_initialized)
+      throw std::runtime_error{"Error: failed to initialize GLFW!"};
+    // Window hints applied to the contexts created afterwards.
+    glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
+    glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3);
+    glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
+#if defined(__APPLE__)
+    glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE);
+#endif
+  }
+
+  //! @brief Initializes GLEW; compiled out on Apple platforms.
+  //! @throws std::runtime_error with GLEW's error string on failure.
+  static auto init_glew() -> void
+  {
+#if !defined(__APPLE__)
+    const auto status = glewInit();
+    if (status == GLEW_OK)
+      return;
+    const auto* const reason =
+        reinterpret_cast<const char*>(glewGetErrorString(status));
+    throw std::runtime_error{fmt::format(
+        "Error: failed to initialize GLEW: {}", std::string_view{reason})};
+#endif
+  }
+
+private:
+  static bool _glfw_initialized;  //!< Result of the call to glfwInit().
+
+  GLFWwindow* _window = nullptr;
+  //! @brief Framebuffer sizes.
+  //! We want to use this and not the window sizes because of macOS Retina
+  //! displays.
+  Eigen::Vector2i _fb_sizes = -Eigen::Vector2i::Ones();
+  //! @brief The odometry pipeline, which provides the frame to display.
+  sara::v2::OdometryPipeline _pipeline;
+
+  //! Video rendering
+  //!
+  //! @brief The viewport.
+  k::Viewport _video_viewport;
+  //! @brief The video texture.
+  kgl::TexturedImage2D _texture;
+  //! @brief The video quad.
+  kgl::TexturedQuad _quad;
+  //! @brief Texture renderer.
+  kgl::TextureRenderer _texture_renderer;
+  //! @brief Projection matrix used to render the video quad.
+  Eigen::Matrix4f _projection;
+
+  //! Point cloud rendering
+  //!
+  //! @brief The viewport.
+  k::Viewport _point_cloud_viewport;
+  //! @brief Point cloud GPU data.
+  kgl::ColoredPointCloud _point_cloud;
+  //! @brief Point cloud GPU renderer.
+  kgl::ColoredPointCloudRenderer _point_cloud_renderer;
+  //! @brief Projection matrix used to render the point cloud.
+  Eigen::Matrix4f _point_cloud_projection;
+  // kgl::Camera _point_cloud_camera;
+  float _point_size = 5.f;  //!< Point size handed to the point cloud renderer.
+};
+
+bool SingleWindowApp::_glfw_initialized = false;  // GLFW is not started yet.
+
+
+//! @brief Runs the odometry viewer on the video given on the command line.
+auto main(int const argc, char** const argv) -> int
+{
+#if defined(_OPENMP)
+  // Give OpenMP and Eigen the same thread budget.
+  const auto max_threads = omp_get_max_threads();
+  omp_set_num_threads(max_threads);
+  Eigen::setNbThreads(max_threads);
+#endif
+
+  if (argc < 2)
+  {
+    std::cout << fmt::format("Usage: {} VIDEO_PATH\n",
+                             std::string_view{argv[0]});
+    return EXIT_FAILURE;
+  }
+  const auto video_path = fs::path{argv[1]};
+
+  // Hard-coded camera parameters.
+  auto camera = sara::v2::BrownConradyDistortionModel<double>{};
+  camera.fx() = 917.2878392016245;
+  camera.fy() = 917.2878392016245;
+  camera.shear() = 0.;
+  camera.u0() = 960.;
+  camera.v0() = 540.;
+  // clang-format off
+  camera.k() <<
+    -0.2338367557617234,
+    0.05952465745165465,
+    -0.007947847982157091;
+  // clang-format on
+  camera.p() << -0.0003137658969742134, 0.00021943576376532096;
+
+  try
+  {
+    auto app = SingleWindowApp{{800, 600}, "Odometry: " + video_path.string()};
+    app.set_config(video_path, camera);
+    app.run();
+  }
+  catch (const std::exception& error)
+  {
+    std::cerr << error.what() << std::endl;
+    return EXIT_FAILURE;
+  }
+  return EXIT_SUCCESS;
+}
diff --git a/cpp/examples/Shakti/Halide/SIFT/V2/halide_sift_pyramid_example.cpp b/cpp/examples/Shakti/Halide/SIFT/V2/halide_sift_pyramid_example.cpp
index 1c3c0c0ae..79fbfb429 100644
--- a/cpp/examples/Shakti/Halide/SIFT/V2/halide_sift_pyramid_example.cpp
+++ b/cpp/examples/Shakti/Halide/SIFT/V2/halide_sift_pyramid_example.cpp
@@ -17,7 +17,7 @@
 #include <DO/Sara/Features.hpp>
 #include <DO/Sara/Graphics.hpp>
 #include <DO/Sara/ImageIO.hpp>
-#include <DO/Sara/SfM/BuildingBlocks/FundamentalMatrixEstimation.hpp>
+#include <DO/Sara/SfM/Helpers/FundamentalMatrixEstimation.hpp>
 #include <DO/Sara/Visualization.hpp>
 
 #include <DO/Shakti/Halide/SIFT/Draw.hpp>
diff --git a/cpp/src/DO/Sara/MultiViewGeometry/BundleAdjustmentProblem.hpp b/cpp/src/DO/Sara/MultiViewGeometry/BundleAdjustmentProblem.hpp
index 92ee6f733..c296da689 100644
--- a/cpp/src/DO/Sara/MultiViewGeometry/BundleAdjustmentProblem.hpp
+++ b/cpp/src/DO/Sara/MultiViewGeometry/BundleAdjustmentProblem.hpp
@@ -12,7 +12,7 @@
 #pragma once
 
 #include <DO/Sara/Core/Tensor.hpp>
-#include <DO/Sara/MultiViewGeometry/FeatureGraph.hpp>
+#include <DO/Sara/MultiViewGeometry/Graph/FeatureGraph.hpp>
 
 #include <vector>
 
diff --git a/cpp/src/DO/Sara/MultiViewGeometry/FeatureGraph.cpp b/cpp/src/DO/Sara/MultiViewGeometry/FeatureGraph.cpp
deleted file mode 100644
index 41bc34dcb..000000000
--- a/cpp/src/DO/Sara/MultiViewGeometry/FeatureGraph.cpp
+++ /dev/null
@@ -1,311 +0,0 @@
-// ========================================================================== //
-// This file is part of Sara, a basic set of libraries in C++ for computer
-// vision.
-//
-// Copyright (C) 2019 David Ok <david.ok8@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla Public
-// License v. 2.0. If a copy of the MPL was not distributed with this file,
-// you can obtain one at http://mozilla.org/MPL/2.0/.
-// ========================================================================== //
-
-#include <DO/Sara/Features/KeypointList.hpp>
-#include <DO/Sara/MultiViewGeometry/DataTransformations.hpp>
-#include <DO/Sara/MultiViewGeometry/FeatureGraph.hpp>
-
-
-namespace DO::Sara {
-
-auto populate_feature_gids(
-    const std::vector<KeypointList<OERegion, float>>& keypoints)
-    -> std::vector<FeatureGID>
-{
-  const auto image_ids = range(static_cast<int>(keypoints.size()));
-
-  auto populate_gids = [&](auto image_id) {
-    const auto num_features =
-        static_cast<int>(features(keypoints[image_id]).size());
-    auto lids = range(num_features);
-    auto gids = std::vector<FeatureGID>(lids.size());
-    std::transform(std::begin(lids), std::end(lids), std::begin(gids),
-                   [&](auto lid) -> FeatureGID {
-                     return {image_id, lid};
-                   });
-    return gids;
-  };
-
-  const auto gids =
-      std::accumulate(std::begin(image_ids), std::end(image_ids),  //
-                      std::vector<FeatureGID>{},                   //
-                      [&](const auto& gids, const auto image_id) {
-                        auto gids_union = gids;
-                        ::append(gids_union, populate_gids(image_id));
-                        return gids_union;
-                      });
-
-  return gids;
-}
-
-auto calculate_feature_id_offsets(
-    const std::vector<KeypointList<OERegion, float>>& keypoints)
-    -> std::vector<int>
-{
-  auto fid_offsets = std::vector<int>(keypoints.size(), 0);
-  std::transform(std::begin(keypoints), std::end(keypoints) - 1,
-                 std::begin(fid_offsets) + 1, [](const auto& keypoints) {
-                   return static_cast<int>(features(keypoints).size());
-                 });
-
-  std::partial_sum(std::begin(fid_offsets), std::end(fid_offsets),
-                   std::begin(fid_offsets));
-
-  return fid_offsets;
-}
-
-auto populate_feature_tracks(const ViewAttributes& view_attributes,
-                             const EpipolarEdgeAttributes& epipolar_edges)
-    -> std::pair<FeatureGraph, std::vector<std::vector<int>>>
-{
-  const auto& keypoints = view_attributes.keypoints;
-
-  const auto gids = populate_feature_gids(keypoints);
-  const auto num_keypoints = gids.size();
-
-  // Populate the vertices.
-  const auto feature_ids = range(static_cast<int>(num_keypoints));
-  auto graph = FeatureGraph{num_keypoints};
-  // Fill the GID attribute for each vertex.
-  std::for_each(std::begin(feature_ids), std::end(feature_ids),
-                [&](auto v) { graph[v] = gids[v]; });
-
-  const auto feature_id_offset = calculate_feature_id_offsets(keypoints);
-
-  // Incremental connected components.
-  using ICC = IncrementalConnectedComponentsHelper;
-  auto rank = ICC::initialize_ranks(graph);
-  auto parent = ICC::initialize_parents(graph);
-  auto ds = ICC::initialize_disjoint_sets(rank, parent);
-  ICC::initialize_incremental_components(graph, ds);
-
-  auto add_edge = [&](auto u, auto v) {
-    boost::add_edge(u, v, graph);
-    ds.union_set(u, v);
-  };
-
-  const auto& edge_ids = epipolar_edges.edge_ids;
-  const auto& edges = epipolar_edges.edges;
-  const auto& matches = epipolar_edges.matches;
-  const auto& E_inliers = epipolar_edges.E_inliers;
-  const auto& two_view_geometries = epipolar_edges.two_view_geometries;
-
-  // Populate the edges.
-  std::for_each(std::begin(edge_ids), std::end(edge_ids), [&](const auto& ij) {
-    const auto& eij = edges[ij];
-    const auto i = eij.first;
-    const auto j = eij.second;
-    const auto& Mij = matches[ij];
-    const auto& inliers_ij = E_inliers[ij];
-    const auto& cheirality_ij = two_view_geometries[ij].cheirality;
-
-    std::cout << std::endl;
-    SARA_DEBUG << "Processing image pair " << i << " " << j << std::endl;
-
-    SARA_DEBUG << "Checking if there are inliers..." << std::endl;
-    SARA_CHECK(cheirality_ij.count());
-    SARA_CHECK(inliers_ij.flat_array().count());
-    if (inliers_ij.flat_array().count() == 0)
-      return;
-
-    SARA_DEBUG << "Calculating cheiral inliers..." << std::endl;
-    SARA_CHECK(cheirality_ij.size());
-    SARA_CHECK(inliers_ij.size());
-    static_assert(std::is_same_v<decltype(inliers_ij.size()), std::size_t>);
-    if (static_cast<std::size_t>(cheirality_ij.size()) != inliers_ij.size())
-      throw std::runtime_error{"cheirality_ij.size() != inliers_ij.size()"};
-
-    const Array<bool, 1, Dynamic> cheiral_inliers =
-        inliers_ij.row_vector().array() && cheirality_ij;
-    SARA_CHECK(cheiral_inliers.size());
-    SARA_CHECK(cheiral_inliers.count());
-
-    // Convert each match 'm' to a pair of point indices '(p, q)'.
-    SARA_DEBUG << "Transforming matches..." << std::endl;
-    const auto pq_tensor = to_tensor(Mij);
-    SARA_CHECK(Mij.size());
-    SARA_CHECK(pq_tensor.size(0));
-
-    if (pq_tensor.empty())
-      return;
-
-    SARA_DEBUG << "Updating disjoint sets..." << std::endl;
-    for (int m = 0; m < pq_tensor.size(0); ++m)
-    {
-      if (!cheiral_inliers(m))
-        continue;
-
-      const auto p = pq_tensor(m, 0);
-      const auto q = pq_tensor(m, 1);
-
-      const auto &p_off = feature_id_offset[i];
-      const auto &q_off = feature_id_offset[j];
-
-      const auto vp = p_off + p;
-      const auto vq = q_off + q;
-
-      // Runtime checks.
-      if (graph[vp].image_id != i)
-        throw std::runtime_error{"image_id[vp] != i"};
-      if (graph[vp].local_id != p)
-        throw std::runtime_error{"local_id[vp] != p"};
-
-      if (graph[vq].image_id != j)
-        throw std::runtime_error{"image_id[vq] != j"};
-      if (graph[vq].local_id != q)
-        throw std::runtime_error{"local_id[vq] != q"};
-
-      // Update the graph and the disjoint sets.
-      add_edge(vp, vq);
-    }
-  });
-
-  // Calculate the connected components.
-  auto components = std::vector<std::vector<int>>{};
-  {
-    const auto components_tmp = ICC::get_components(parent);
-    components.resize(components_tmp.size());
-    for (auto c : components_tmp)
-      for (auto [child, child_end] = components_tmp[c]; child != child_end; ++child)
-        components[c].push_back(static_cast<int>(*child));
-  }
-
-
-  return {graph, components};
-}
-
-auto filter_feature_tracks(const FeatureGraph& graph,
-                           const std::vector<std::vector<int>>& components,
-                           ViewAttributes& views)
-    -> std::set<std::set<FeatureGID>>
-{
-  // Populate the feature tracks regardless of their cardinality.
-  auto feature_tracks = std::set<std::set<FeatureGID>>{};
-  for (const auto& component : components)
-  {
-    auto feature_track = std::set<FeatureGID>{};
-    std::transform(component.begin(), component.end(),
-                   std::inserter(feature_track, std::begin(feature_track)),
-                   [&](const auto v) { return graph[v]; });
-
-    feature_tracks.insert(feature_track);
-  }
-
-  // Remove redundant features across images.
-  using ImageID = int;
-  using FeatureID = int;
-
-  auto filtered_feature_tracks_dict = std::set<std::map<ImageID, FeatureID>>{};
-  for (const auto& track : feature_tracks)
-  {
-    auto filtered_track = std::map<ImageID, FeatureID>{};
-    for (const auto& f : track)
-    {
-      const auto current_feature = filtered_track.find(f.image_id);
-      const auto current_feature_found =
-          current_feature != filtered_track.end();
-
-      if (!current_feature_found)
-        filtered_track[f.image_id] = f.local_id;
-
-      // Replace the feature if the response is stronger.
-      else  // image_id == f.image_id
-      {
-        const auto& features_list = features(views.keypoints[f.image_id]);
-
-        // The feature local IDs.
-        const auto& current_feature_id = current_feature->second;
-        const auto current_feature_response =
-            std::abs(features_list[current_feature_id].extremum_value);
-
-        const auto feature_response =
-            std::abs(features_list[f.local_id].extremum_value);
-
-        if (feature_response > current_feature_response)
-          filtered_track[f.image_id] = f.local_id;
-      }
-    }
-
-    filtered_feature_tracks_dict.insert(filtered_track);
-  }
-
-  // Transform the filtered feature tracks again.
-  auto filtered_feature_tracks = std::set<std::set<FeatureGID>>{};
-  for (const auto& track_dict : filtered_feature_tracks_dict)
-  {
-    if (track_dict.size() == 1)
-      continue;
-
-    auto track_set = std::set<FeatureGID>{};
-    for (const auto& gid : track_dict)
-      track_set.insert({gid.first, gid.second});
-    filtered_feature_tracks.insert(track_set);
-  }
-
-  // Replace the feature tracks.
-  feature_tracks.swap(filtered_feature_tracks);
-
-  return feature_tracks;
-}
-
-
-template <>
-struct CalculateH5Type<FeatureGID>
-{
-  static inline auto value() -> H5::CompType
-  {
-    auto h5_comp_type = H5::CompType{sizeof(FeatureGID)};
-    INSERT_MEMBER(h5_comp_type, FeatureGID, image_id);
-    INSERT_MEMBER(h5_comp_type, FeatureGID, local_id);
-    return h5_comp_type;
-  }
-};
-
-
-auto write_feature_graph(const FeatureGraph& graph, H5File& file,
-                                const std::string& group_name) -> void
-{
-  auto features = std::vector<FeatureGID>(boost::num_vertices(graph));
-  for (auto [v, v_end] = boost::vertices(graph); v != v_end; ++v)
-    features[*v] = {graph[*v].image_id, graph[*v].local_id};
-
-  auto matches = std::vector<Vector2i>{};
-  for (auto [e, e_end] = boost::edges(graph); e != e_end; ++e)
-    matches.push_back({boost::source(*e, graph), boost::target(*e, graph)});
-
-  file.get_group(group_name);
-  file.write_dataset(group_name + "/" + "features", tensor_view(features));
-  file.write_dataset(group_name + "/" + "matches", tensor_view(matches));
-}
-
-
-auto read_feature_graph(H5File& file, const std::string& group_name)
-    -> FeatureGraph
-{
-  auto features = std::vector<FeatureGID>{};
-  auto matches = std::vector<Vector2i>{};
-
-  file.read_dataset(group_name + "/" + "features", features);
-  file.read_dataset(group_name + "/" + "matches", matches);
-
-  // Reconstruct the graph.
-  auto g = FeatureGraph{};
-
-  for (const auto& v : features)
-    boost::add_vertex(v, g);
-
-  for (const auto& e : matches)
-    boost::add_edge(e(0), e(1), g);
-
-  return g;
-}
-
-} /* namespace DO::Sara */
diff --git a/cpp/src/DO/Sara/MultiViewGeometry/EpipolarGraph.cpp b/cpp/src/DO/Sara/MultiViewGeometry/Graph/EpipolarGraph.cpp
similarity index 99%
rename from cpp/src/DO/Sara/MultiViewGeometry/EpipolarGraph.cpp
rename to cpp/src/DO/Sara/MultiViewGeometry/Graph/EpipolarGraph.cpp
index b303d61d1..45d62a58c 100644
--- a/cpp/src/DO/Sara/MultiViewGeometry/EpipolarGraph.cpp
+++ b/cpp/src/DO/Sara/MultiViewGeometry/Graph/EpipolarGraph.cpp
@@ -9,11 +9,12 @@
 // you can obtain one at http://mozilla.org/MPL/2.0/.
 // ========================================================================== //
 
+#include <DO/Sara/MultiViewGeometry/Graph/EpipolarGraph.hpp>
+
 #include <DO/Sara/Core/StringFormat.hpp>
 #include <DO/Sara/Features.hpp>
 #include <DO/Sara/FileSystem.hpp>
 #include <DO/Sara/ImageIO.hpp>
-#include <DO/Sara/MultiViewGeometry/EpipolarGraph.hpp>
 #include <DO/Sara/MultiViewGeometry/HDF5.hpp>
 
 
diff --git a/cpp/src/DO/Sara/MultiViewGeometry/EpipolarGraph.hpp b/cpp/src/DO/Sara/MultiViewGeometry/Graph/EpipolarGraph.hpp
similarity index 100%
rename from cpp/src/DO/Sara/MultiViewGeometry/EpipolarGraph.hpp
rename to cpp/src/DO/Sara/MultiViewGeometry/Graph/EpipolarGraph.hpp
diff --git a/cpp/src/DO/Sara/MultiViewGeometry/Graph/FeatureGraph.cpp b/cpp/src/DO/Sara/MultiViewGeometry/Graph/FeatureGraph.cpp
new file mode 100644
index 000000000..3b9e4c23c
--- /dev/null
+++ b/cpp/src/DO/Sara/MultiViewGeometry/Graph/FeatureGraph.cpp
@@ -0,0 +1,318 @@
+// ========================================================================== //
+// This file is part of Sara, a basic set of libraries in C++ for computer
+// vision.
+//
+// Copyright (C) 2019 David Ok <david.ok8@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License v. 2.0. If a copy of the MPL was not distributed with this file,
+// you can obtain one at http://mozilla.org/MPL/2.0/.
+// ========================================================================== //
+
+#include <DO/Sara/MultiViewGeometry/Graph/FeatureGraph.hpp>
+
+#include <DO/Sara/Features/KeypointList.hpp>
+#include <DO/Sara/MultiViewGeometry/DataTransformations.hpp>
+
+
+namespace DO::Sara {
+
+  //! @brief Enumerates the global IDs of all the features.
+  //!
+  //! A global ID is the pair `(image_id, local_id)`. The IDs are listed image
+  //! after image and, within an image, by increasing local ID.
+  //!
+  //! @param keypoints the keypoint list of each image, indexed by image ID.
+  //! @return the feature GIDs of all the images, concatenated.
+  auto populate_feature_gids(
+      const std::vector<KeypointList<OERegion, float>>& keypoints)
+      -> std::vector<FeatureGID>
+  {
+    const auto num_images = static_cast<int>(keypoints.size());
+
+    // Append in place so that the cost stays linear in the number of features:
+    // folding with `std::accumulate` copies the whole accumulated vector once
+    // per image.
+    auto gids = std::vector<FeatureGID>{};
+
+    for (auto image_id = 0; image_id < num_images; ++image_id)
+    {
+      const auto num_features =
+          static_cast<int>(features(keypoints[image_id]).size());
+
+      for (auto local_id = 0; local_id < num_features; ++local_id)
+        gids.push_back({image_id, local_id});
+    }
+
+    return gids;
+  }
+
+  //! @brief Calculates the global ID of the first feature of each image, i.e.
+  //! the exclusive prefix sum of the per-image feature counts.
+  //! @return one offset per image; empty if `keypoints` is empty.
+  auto calculate_feature_id_offsets(
+      const std::vector<KeypointList<OERegion, float>>& keypoints)
+      -> std::vector<int>
+  {
+    // An index loop stays well-defined for an empty list, unlike forming the
+    // iterators `std::end(keypoints) - 1` and `std::begin(fid_offsets) + 1`.
+    auto fid_offsets = std::vector<int>(keypoints.size(), 0);
+    for (auto i = std::size_t{1}; i < keypoints.size(); ++i)
+      fid_offsets[i] = fid_offsets[i - 1] +
+                       static_cast<int>(features(keypoints[i - 1]).size());
+    return fid_offsets;
+  }
+  //! @brief Builds the feature graph and groups its vertices into components.
+  auto populate_feature_tracks(const ViewAttributes& view_attributes,
+                               const EpipolarEdgeAttributes& epipolar_edges)
+      -> std::pair<FeatureGraph, std::vector<std::vector<int>>>
+  {
+    const auto& keypoints = view_attributes.keypoints;
+
+    const auto gids = populate_feature_gids(keypoints);
+    const auto num_keypoints = gids.size();
+
+    // Populate the vertices.
+    const auto feature_ids = range(static_cast<int>(num_keypoints));
+    auto graph = FeatureGraph{num_keypoints};
+    // Fill the GID attribute for each vertex.
+    std::for_each(std::begin(feature_ids), std::end(feature_ids),
+                  [&](auto v) { graph[v] = gids[v]; });
+    // feature_id_offset[i]: vertex index of the first feature of image i.
+    const auto feature_id_offset = calculate_feature_id_offsets(keypoints);
+
+    // Incremental connected components.
+    using ICC = IncrementalConnectedComponentsHelper;
+    auto rank = ICC::initialize_ranks(graph);
+    auto parent = ICC::initialize_parents(graph);
+    auto ds = ICC::initialize_disjoint_sets(rank, parent);
+    ICC::initialize_incremental_components(graph, ds);
+    // Adding an edge to the graph also merges the two disjoint sets.
+    auto add_edge = [&](auto u, auto v) {
+      boost::add_edge(u, v, graph);
+      ds.union_set(u, v);
+    };
+
+    const auto& edge_ids = epipolar_edges.edge_ids;
+    const auto& edges = epipolar_edges.edges;
+    const auto& matches = epipolar_edges.matches;
+    const auto& E_inliers = epipolar_edges.E_inliers;
+    const auto& two_view_geometries = epipolar_edges.two_view_geometries;
+
+    // Populate the edges with the matches that are cheiral E-inliers.
+    std::for_each(
+        std::begin(edge_ids), std::end(edge_ids), [&](const auto& ij) {
+          const auto& eij = edges[ij];
+          const auto i = eij.first;
+          const auto j = eij.second;
+          const auto& Mij = matches[ij];
+          const auto& inliers_ij = E_inliers[ij];
+          const auto& cheirality_ij = two_view_geometries[ij].cheirality;
+
+          std::cout << std::endl;
+          SARA_DEBUG << "Processing image pair " << i << " " << j << std::endl;
+
+          SARA_DEBUG << "Checking if there are inliers..." << std::endl;
+          SARA_CHECK(cheirality_ij.count());
+          SARA_CHECK(inliers_ij.flat_array().count());
+          if (inliers_ij.flat_array().count() == 0)
+            return;
+
+          SARA_DEBUG << "Calculating cheiral inliers..." << std::endl;
+          SARA_CHECK(cheirality_ij.size());
+          SARA_CHECK(inliers_ij.size());
+          static_assert(
+              std::is_same_v<decltype(inliers_ij.size()), std::size_t>);
+          if (static_cast<std::size_t>(cheirality_ij.size()) !=
+              inliers_ij.size())
+            throw std::runtime_error{
+                "cheirality_ij.size() != inliers_ij.size()"};
+
+          const Array<bool, 1, Dynamic> cheiral_inliers =
+              inliers_ij.row_vector().array() && cheirality_ij;
+          SARA_CHECK(cheiral_inliers.size());
+          SARA_CHECK(cheiral_inliers.count());
+
+          // Convert each match 'm' to a pair of point indices '(p, q)'.
+          SARA_DEBUG << "Transforming matches..." << std::endl;
+          const auto pq_tensor = to_tensor(Mij);
+          SARA_CHECK(Mij.size());
+          SARA_CHECK(pq_tensor.size(0));
+
+          if (pq_tensor.empty())
+            return;
+
+          SARA_DEBUG << "Updating disjoint sets..." << std::endl;
+          for (int m = 0; m < pq_tensor.size(0); ++m)
+          {
+            if (!cheiral_inliers(m))
+              continue;
+
+            const auto p = pq_tensor(m, 0);
+            const auto q = pq_tensor(m, 1);
+
+            const auto& p_off = feature_id_offset[i];
+            const auto& q_off = feature_id_offset[j];
+
+            const auto vp = p_off + p;
+            const auto vq = q_off + q;
+
+            // Runtime checks: vp and vq must carry GIDs (i, p) and (j, q).
+            if (graph[vp].image_id != i)
+              throw std::runtime_error{"image_id[vp] != i"};
+            if (graph[vp].local_id != p)
+              throw std::runtime_error{"local_id[vp] != p"};
+
+            if (graph[vq].image_id != j)
+              throw std::runtime_error{"image_id[vq] != j"};
+            if (graph[vq].local_id != q)
+              throw std::runtime_error{"local_id[vq] != q"};
+
+            // Update the graph and the disjoint sets.
+            add_edge(vp, vq);
+          }
+        });
+
+    // Calculate the connected components.
+    auto components = std::vector<std::vector<int>>{};
+    {
+      const auto components_tmp = ICC::get_components(parent);
+      components.resize(components_tmp.size());
+      for (auto c : components_tmp)
+        for (auto [child, child_end] = components_tmp[c]; child != child_end;
+             ++child)
+          components[c].push_back(static_cast<int>(*child));
+    }
+
+    // components[c] holds the vertex indices of the c-th component.
+    return {graph, components};
+  }
+
+  //! @brief Converts the connected components of the feature graph into
+  //! feature tracks that contain at most one feature per image.
+  //!
+  //! When a component contains several features of the same image, only the
+  //! one with the largest absolute extremum value is kept. Tracks that end up
+  //! with exactly one feature are dropped.
+  //!
+  //! @param graph the feature graph, whose vertices carry the feature GIDs.
+  //! @param components the vertex indices of each connected component.
+  //! @param views the view attributes; only their keypoints are read here.
+  //! @return the set of filtered feature tracks.
+  auto filter_feature_tracks(const FeatureGraph& graph,
+                             const std::vector<std::vector<int>>& components,
+                             ViewAttributes& views)
+      -> std::set<std::set<FeatureGID>>
+  {
+    using ImageID = int;
+    using FeatureID = int;
+
+    // Gather each component as a set of GIDs, regardless of its cardinality.
+    auto raw_tracks = std::set<std::set<FeatureGID>>{};
+    for (const auto& component : components)
+    {
+      auto raw_track = std::set<FeatureGID>{};
+      for (const auto v : component)
+        raw_track.insert(graph[v]);
+
+      raw_tracks.insert(raw_track);
+    }
+
+    // Remove the redundant features: keep one feature per image and per track.
+    auto track_dicts = std::set<std::map<ImageID, FeatureID>>{};
+    for (const auto& raw_track : raw_tracks)
+    {
+      auto track_dict = std::map<ImageID, FeatureID>{};
+      for (const auto& f : raw_track)
+      {
+        // Try to register the feature as the representative of its image.
+        auto [kept, is_new_image] =
+            track_dict.emplace(f.image_id, f.local_id);
+        if (is_new_image)
+          continue;
+
+        // The image is already represented in the track: replace the feature
+        // kept so far only if the response of this one is strictly stronger.
+        const auto& image_features = features(views.keypoints[f.image_id]);
+
+        const auto kept_response =
+            std::abs(image_features[kept->second].extremum_value);
+        const auto candidate_response =
+            std::abs(image_features[f.local_id].extremum_value);
+
+        if (candidate_response > kept_response)
+          kept->second = f.local_id;
+      }
+      track_dicts.insert(track_dict);
+    }
+
+    // Convert the dictionaries back to sets of GIDs, skipping the tracks
+    // reduced to a single feature.
+    auto tracks = std::set<std::set<FeatureGID>>{};
+    for (const auto& track_dict : track_dicts)
+    {
+      if (track_dict.size() == 1)
+        continue;
+
+      auto track = std::set<FeatureGID>{};
+      for (const auto& [image_id, local_id] : track_dict)
+        track.insert({image_id, local_id});
+      tracks.insert(track);
+    }
+
+    return tracks;
+  }
+
+  //! @brief HDF5 compound type of FeatureGID (members image_id, local_id).
+  template <>
+  struct CalculateH5Type<FeatureGID>
+  {
+    static inline auto value() -> H5::CompType
+    {
+      auto h5_comp_type = H5::CompType{sizeof(FeatureGID)};
+      INSERT_MEMBER(h5_comp_type, FeatureGID, image_id);
+      INSERT_MEMBER(h5_comp_type, FeatureGID, local_id);
+      return h5_comp_type;
+    }
+  };
+
+
+  //! @brief Writes the vertex GIDs and the edges of the graph to an HDF5 group.
+  auto write_feature_graph(const FeatureGraph& graph, H5File& file,
+                           const std::string& group_name) -> void
+  {
+    auto gids = std::vector<FeatureGID>(boost::num_vertices(graph));
+    for (auto [vi, vi_end] = boost::vertices(graph); vi != vi_end; ++vi)
+      gids[*vi] = {graph[*vi].image_id, graph[*vi].local_id};
+    auto pairs = std::vector<Vector2i>{};
+    for (auto [ei, ei_end] = boost::edges(graph); ei != ei_end; ++ei)
+      pairs.push_back({boost::source(*ei, graph), boost::target(*ei, graph)});
+
+    file.get_group(group_name);
+    file.write_dataset(group_name + "/" + "features", tensor_view(gids));
+    file.write_dataset(group_name + "/" + "matches", tensor_view(pairs));
+  }
+
+
+  //! @brief Reads back a feature graph from the datasets "features" and
+  //! "matches" of the HDF5 group `group_name`.
+  auto read_feature_graph(H5File& file, const std::string& group_name)
+      -> FeatureGraph
+  {
+    auto gids = std::vector<FeatureGID>{};
+    file.read_dataset(group_name + "/" + "features", gids);
+
+    auto vertex_pairs = std::vector<Vector2i>{};
+    file.read_dataset(group_name + "/" + "matches", vertex_pairs);
+
+    // Rebuild the graph: the vertices in stored order, then the edges.
+    auto graph = FeatureGraph{};
+    for (const auto& gid : gids)
+      boost::add_vertex(gid, graph);
+    for (const auto& vertex_pair : vertex_pairs)
+      boost::add_edge(vertex_pair(0), vertex_pair(1), graph);
+
+    return graph;
+  }
+
+} /* namespace DO::Sara */
diff --git a/cpp/src/DO/Sara/MultiViewGeometry/FeatureGraph.hpp b/cpp/src/DO/Sara/MultiViewGeometry/Graph/FeatureGraph.hpp
similarity index 98%
rename from cpp/src/DO/Sara/MultiViewGeometry/FeatureGraph.hpp
rename to cpp/src/DO/Sara/MultiViewGeometry/Graph/FeatureGraph.hpp
index 264102d73..44bfae98d 100644
--- a/cpp/src/DO/Sara/MultiViewGeometry/FeatureGraph.hpp
+++ b/cpp/src/DO/Sara/MultiViewGeometry/Graph/FeatureGraph.hpp
@@ -15,7 +15,7 @@
 
 #include <DO/Sara/Core/HDF5.hpp>
 #include <DO/Sara/Features/KeypointList.hpp>
-#include <DO/Sara/MultiViewGeometry/EpipolarGraph.hpp>
+#include <DO/Sara/MultiViewGeometry/Graph/EpipolarGraph.hpp>
 
 #include <boost/graph/adjacency_list.hpp>
 #include <boost/graph/connected_components.hpp>
diff --git a/cpp/src/DO/Sara/MultiViewGeometry/HDF5.hpp b/cpp/src/DO/Sara/MultiViewGeometry/HDF5.hpp
index 35a3f54e8..d8a09f453 100644
--- a/cpp/src/DO/Sara/MultiViewGeometry/HDF5.hpp
+++ b/cpp/src/DO/Sara/MultiViewGeometry/HDF5.hpp
@@ -12,9 +12,9 @@
 #pragma once
 
 #include <DO/Sara/Core/HDF5.hpp>
-#include <DO/Sara/MultiViewGeometry/EpipolarGraph.hpp>
 #include <DO/Sara/MultiViewGeometry/Geometry/FundamentalMatrix.hpp>
 #include <DO/Sara/MultiViewGeometry/Geometry/PinholeCamera.hpp>
+#include <DO/Sara/MultiViewGeometry/Graph/EpipolarGraph.hpp>
 
 
 namespace DO::Sara {
diff --git a/cpp/src/DO/Sara/SfM/Graph/BundleAdjuster.hpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/BundleAdjuster.hpp
similarity index 100%
rename from cpp/src/DO/Sara/SfM/Graph/BundleAdjuster.hpp
rename to cpp/src/DO/Sara/SfM/BuildingBlocks/BundleAdjuster.hpp
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/EssentialMatrixEstimation.cpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/EssentialMatrixEstimation.cpp
deleted file mode 100644
index 1e2ae7892..000000000
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks/EssentialMatrixEstimation.cpp
+++ /dev/null
@@ -1,345 +0,0 @@
-// ========================================================================== //
-// This file is part of Sara, a basic set of libraries in C++ for computer
-// vision.
-//
-// Copyright (C) 2019 David Ok <david.ok8@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla Public
-// License v. 2.0. If a copy of the MPL was not distributed with this file,
-// you can obtain one at http://mozilla.org/MPL/2.0/.
-// ========================================================================== //
-
-#include <DO/Sara/Core/StringFormat.hpp>
-#include <DO/Sara/FileSystem.hpp>
-#include <DO/Sara/RANSAC/RANSAC.hpp>
-#include <DO/Sara/SfM/BuildingBlocks/EssentialMatrixEstimation.hpp>
-#include <DO/Sara/SfM/BuildingBlocks/FundamentalMatrixEstimation.hpp>
-
-#include <boost/filesystem.hpp>
-
-
-namespace fs = boost::filesystem;
-
-
-namespace DO::Sara {
-
-  using ESolver = NisterFivePointAlgorithm;
-
-  auto estimate_essential_matrix(const std::vector<Match>& Mij,            //
-                                 const KeypointList<OERegion, float>& ki,  //
-                                 const KeypointList<OERegion, float>& kj,  //
-                                 const Eigen::Matrix3d& Ki_inv,            //
-                                 const Eigen::Matrix3d& Kj_inv,            //
-                                 int num_samples,                          //
-                                 double err_thres)
-      -> std::tuple<EssentialMatrix, Tensor_<bool, 1>, Tensor_<int, 1>>
-  {
-    const auto& fi = features(ki);
-    const auto& fj = features(kj);
-    const auto ui = extract_centers(fi).cast<double>();
-    const auto uj = extract_centers(fj).cast<double>();
-
-    const auto uni = apply_transform(Ki_inv, homogeneous(ui));
-    const auto unj = apply_transform(Kj_inv, homogeneous(uj));
-
-    const auto Mij_tensor = to_tensor(Mij);
-    const auto Xij = PointCorrespondenceList{Mij_tensor, uni, unj};
-
-    auto inlier_predicate = InlierPredicate<AlgebraicEpipolarDistance>{};
-    inlier_predicate.err_threshold = err_thres;
-
-    const auto [E, inliers, sample_best] =
-        ransac(Xij, ESolver{}, inlier_predicate, num_samples);
-
-    SARA_CHECK(E);
-    SARA_CHECK(inliers.row_vector());
-    SARA_CHECK(Mij.size());
-
-    return std::make_tuple(E, inliers, sample_best);
-  }
-
-  auto estimate_essential_matrices(const std::string& dirpath,      //
-                                   const std::string& h5_filepath,  //
-                                   int num_samples,                 //
-                                   double noise,                    //
-                                   int min_F_inliers,               //
-                                   bool overwrite,                  //
-                                   bool debug,                      //
-                                   bool wait_key) -> void
-  {
-    // Create a backup.
-    if (!fs::exists(h5_filepath + ".bak"))
-      cp(h5_filepath, h5_filepath + ".bak");
-
-    SARA_DEBUG << "Opening file " << h5_filepath << "..." << std::endl;
-    auto h5_file = H5File{h5_filepath, H5F_ACC_RDWR};
-
-    auto view_attributes = ViewAttributes{};
-
-    // Load images (optional).
-    SARA_DEBUG << "Listing images from:\n\t" << dirpath << std::endl;
-    view_attributes.list_images(dirpath);
-    if (debug)
-      view_attributes.read_images();
-
-    // Load the internal camera matrices from Strecha dataset.
-    // N.B.: this is an ad-hoc code.
-    SARA_DEBUG << "Reading internal camera matrices in Strecha's data format"
-               << std::endl;
-    std::for_each(std::begin(view_attributes.image_paths),
-                  std::end(view_attributes.image_paths),
-                  [&](const auto& image_path) {
-                    const auto K_filepath =
-                        dirpath + "/" + basename(image_path) + ".png.K";
-                    SARA_DEBUG << "Reading internal camera matrix from:\n\t"
-                               << K_filepath << std::endl;
-                    view_attributes.cameras.push_back(normalized_camera());
-                    view_attributes.cameras.back().K =
-                        read_internal_camera_parameters(K_filepath);
-                  });
-
-    // Load keypoints.
-    SARA_DEBUG << "Reading keypoints from HDF5 file:\n\t" << h5_filepath
-               << std::endl;
-    view_attributes.read_keypoints(h5_file);
-    const auto& keypoints = view_attributes.keypoints;
-
-    // Initialize the epipolar graph.
-    const auto num_vertices = int(view_attributes.image_paths.size());
-    SARA_CHECK(num_vertices);
-
-    auto edge_attributes = EpipolarEdgeAttributes{};
-    SARA_DEBUG << "Initializing the epipolar edges..." << std::endl;
-    edge_attributes.initialize_edges(num_vertices);
-
-    SARA_DEBUG << "Reading matches from HDF5 file:\n\t" << h5_filepath
-               << std::endl;
-    edge_attributes.read_matches(h5_file, view_attributes);
-
-    SARA_DEBUG << "Reading the fundamental matrices..." << std::endl;
-    edge_attributes.resize_fundamental_edge_list();
-    edge_attributes.read_fundamental_matrices(view_attributes, h5_file);
-    // TODO: we will use the meta data later to decide if we want to estimate an
-    // essential matrix because it is a lot slower than the fundamental
-    // matrix estimation.
-
-    SARA_DEBUG << "Preallocate the E data structures..." << std::endl;
-    edge_attributes.resize_essential_edge_list();
-
-    const auto& edge_ids = edge_attributes.edge_ids;
-    const auto& edges = edge_attributes.edges;
-    const auto& matches = edge_attributes.matches;
-    SARA_CHECK(edge_ids.size());
-    SARA_CHECK(edges.size());
-    SARA_CHECK(matches.size());
-
-    // Mutate these.
-    auto& E = edge_attributes.E;
-    auto& E_num_samples = edge_attributes.E_num_samples;
-    auto& E_noise = edge_attributes.E_noise;
-    auto& E_inliers = edge_attributes.E_inliers;
-    auto& E_best_samples = edge_attributes.E_best_samples;
-
-    // Use this data to decide if we want to estimate an essential matrix.
-    const auto& F_inliers = edge_attributes.F_inliers;
-    auto F_num_inliers = [&](const auto& ij) {
-      return F_inliers[ij].vector().count();
-    };
-    // auto F_inlier_ratio = [&](const auto& ij) {
-    //   return double(F_num_inliers(ij)) / F_inliers[ij].size();
-    // };
-
-    std::for_each(
-        std::begin(edge_ids), std::end(edge_ids), [&](const auto& ij) {
-          const auto& eij = edges[ij];
-          const auto i = eij.first;
-          const auto j = eij.second;
-          const auto& Mij = matches[ij];
-          const auto& ki = keypoints[i];
-          const auto& kj = keypoints[j];
-          const auto& Ki = view_attributes.cameras[i].K;
-          const auto& Kj = view_attributes.cameras[j].K;
-          const auto Ki_inv = Ki.inverse();
-          const auto Kj_inv = Kj.inverse();
-
-          SARA_DEBUG << "Calculating essential matrices between images:\n"
-                     << "- image[" << i << "] = "  //
-                     << view_attributes.group_names[i] << "\n"
-                     << "- image[" << j << "] = "  //
-                     << view_attributes.group_names[j] << "\n";
-
-          SARA_DEBUG << "Internal camera matrices :\n"
-                     << "- K[" << i << "] =\n"
-                     << Ki << "\n"
-                     << "- K[" << j << "] =\n"
-                     << Kj << "\n";
-          std::cout.flush();
-
-          auto Eij = EssentialMatrix{};
-          auto E_best_sample_ij = Tensor_<int, 1>{ESolver::num_points};
-          auto E_inliers_ij = Tensor_<bool, 1>{static_cast<int>(Mij.size())};
-          auto Fij = FundamentalMatrix{};
-          if (F_num_inliers(ij) >= min_F_inliers)
-          {
-            // Estimate the fundamental matrix.
-            std::tie(Eij, E_inliers_ij, E_best_sample_ij) =
-                estimate_essential_matrix(Mij, ki, kj, Ki_inv, Kj_inv,
-                                          num_samples, noise);
-
-            Eij.matrix() = Eij.matrix().normalized();
-
-            Fij.matrix() =
-                Kj.inverse().transpose() * Eij.matrix() * Ki.inverse();
-
-            SARA_DEBUG << "Eij = \n" << Eij << std::endl;
-            SARA_DEBUG << "Fij = \n" << Fij << std::endl;
-            SARA_CHECK(E_inliers_ij.row_vector());
-            SARA_CHECK(E_best_sample_ij.row_vector());
-          }
-          else
-          {
-            Eij.matrix().setZero();
-            E_best_sample_ij.flat_array().setZero();
-            E_inliers_ij.flat_array().setZero();
-          }
-
-          if (debug)
-          {
-            const int display_step = 20;
-            const auto& Ii = view_attributes.images[i];
-            const auto& Ij = view_attributes.images[j];
-            check_epipolar_constraints(Ii, Ij, Fij, Mij, E_best_sample_ij,
-                                       E_inliers_ij, display_step, wait_key);
-          }
-
-          // Update.
-          E[ij] = Eij;
-          E_inliers[ij] = E_inliers_ij;
-          E_best_samples[ij] = E_best_sample_ij;
-          // Useful if we use MLESAC and variants.
-          E_noise[ij] = noise;
-          // Useful if we use PROSAC sampling strategy.
-          E_num_samples[ij] = num_samples;
-
-          SARA_CHECK(E_num_samples[ij]);
-          SARA_CHECK(E_noise[ij]);
-        });
-
-    h5_file.write_dataset("E", tensor_view(E), overwrite);
-    h5_file.write_dataset("E_num_samples", tensor_view(E_num_samples),
-                          overwrite);
-    h5_file.write_dataset("E_noise", tensor_view(E_noise), overwrite);
-    h5_file.write_dataset("E_best_samples", E_best_samples, overwrite);
-    // Save E-edges.
-    h5_file.get_group("E_inliers");
-    std::for_each(std::begin(edge_ids), std::end(edge_ids),
-                  [&](const auto& ij) {
-                    const auto i = edges[ij].first;
-                    const auto j = edges[ij].second;
-                    h5_file.write_dataset(format("E_inliers/%d_%d", i, j),
-                                          E_inliers[ij], overwrite);
-                  });
-  }
-
-  auto inspect_essential_matrices(const std::string& dirpath,
-                                  const std::string& h5_filepath,
-                                  int display_step, bool wait_key) -> void
-  {
-    SARA_DEBUG << "Opening file " << h5_filepath << "..." << std::endl;
-    auto h5_file = H5File{h5_filepath, H5F_ACC_RDONLY};
-
-    auto view_attributes = ViewAttributes{};
-
-    // Load images (optional).
-    SARA_DEBUG << "Listing images from:\n\t" << dirpath << std::endl;
-    view_attributes.list_images(dirpath);
-    view_attributes.read_images();
-
-    // Load keypoints.
-    SARA_DEBUG << "Reading keypoints from HDF5 file:\n\t" << h5_filepath
-               << std::endl;
-    view_attributes.read_keypoints(h5_file);
-    const auto& images = view_attributes.images;
-
-    // Load the internal camera matrices from Strecha dataset.
-    // N.B.: this is an ad-hoc code.
-    SARA_DEBUG << "Reading internal camera matrices in Strecha's data format"
-               << std::endl;
-    std::for_each(std::begin(view_attributes.image_paths),
-                  std::end(view_attributes.image_paths),
-                  [&](const auto& image_path) {
-                    const auto K_filepath =
-                        dirpath + "/" + basename(image_path) + ".png.K";
-                    SARA_DEBUG << "Reading internal camera matrix from:\n\t"
-                               << K_filepath << std::endl;
-                    view_attributes.cameras.push_back(normalized_camera());
-                    view_attributes.cameras.back().K =
-                        read_internal_camera_parameters(K_filepath);
-                  });
-
-    // Initialize the epipolar graph.
-    const auto num_vertices = int(view_attributes.image_paths.size());
-    SARA_CHECK(num_vertices);
-
-    auto edge_attributes = EpipolarEdgeAttributes{};
-    SARA_DEBUG << "Initializing the epipolar edges..." << std::endl;
-    edge_attributes.initialize_edges(num_vertices);
-
-    SARA_DEBUG << "Reading matches from HDF5 file:\n\t" << h5_filepath
-               << std::endl;
-    edge_attributes.read_matches(h5_file, view_attributes);
-
-    SARA_DEBUG << "Reading the essential matrices..." << std::endl;
-    edge_attributes.resize_essential_edge_list();
-    edge_attributes.read_essential_matrices(view_attributes, h5_file);
-
-    // Convenient references.
-    const auto& edge_ids = edge_attributes.edge_ids;
-    const auto& edges = edge_attributes.edges;
-    const auto& matches = edge_attributes.matches;
-
-    const auto& E = edge_attributes.E;
-    const auto& E_num_samples = edge_attributes.E_num_samples;
-    const auto& E_noise = edge_attributes.E_noise;
-    const auto& E_best_samples = edge_attributes.E_best_samples;
-    const auto& E_inliers = edge_attributes.E_inliers;
-
-    std::for_each(
-        std::begin(edge_ids), std::end(edge_ids), [&](const auto& ij) {
-          const auto& eij = edges[ij];
-          const auto i = eij.first;
-          const auto j = eij.second;
-          const auto& Mij = matches[ij];
-
-          const auto& Eij = E[ij];
-
-          const auto& Ki = view_attributes.cameras[i].K;
-          const auto& Kj = view_attributes.cameras[j].K;
-
-          SARA_DEBUG << "Internal camera matrices :\n"
-                     << "- K[" << i << "] =\n"
-                     << Ki << "\n"
-                     << "- K[" << j << "] =\n"
-                     << Kj << "\n";
-
-          SARA_DEBUG
-              << "Forming the fundamental matrix from the essential matrix:\n";
-          std::cout.flush();
-          auto Fij = FundamentalMatrix{};
-          Fij.matrix() = Kj.inverse().transpose() * Eij.matrix() * Ki.inverse();
-
-          SARA_DEBUG << "Fij = \n" << E[ij] << std::endl;
-          SARA_CHECK(E_num_samples[ij]);
-          SARA_CHECK(E_noise[ij]);
-          SARA_CHECK(E_inliers[ij].row_vector());
-          SARA_CHECK(E_inliers[ij].row_vector().count());
-          SARA_CHECK(E_best_samples[ij].row_vector());
-
-          const auto& Ii = images[i];
-          const auto& Ij = images[j];
-          check_epipolar_constraints(Ii, Ij, Fij, Mij, E_best_samples[ij],
-                                     E_inliers[ij], display_step, wait_key);
-        });
-  }
-
-} /* namespace DO::Sara */
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/FeatureParams.hpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/FeatureParams.hpp
new file mode 100644
index 000000000..7ee194712
--- /dev/null
+++ b/cpp/src/DO/Sara/SfM/BuildingBlocks/FeatureParams.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include <DO/Sara/ImageProcessing/ImagePyramid.hpp>
+
+
+namespace DO::Sara {
+
+  struct FeatureParams
+  {
+    ImagePyramidParams image_pyr_params = ImagePyramidParams(0);
+    float sift_nn_ratio = 0.6f;
+    std::size_t num_matches_max = 1000u;
+    std::size_t num_inliers_min = 100u;
+  };
+
+}  // namespace DO::Sara
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/FundamentalMatrixEstimation.cpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/FundamentalMatrixEstimation.cpp
deleted file mode 100644
index a799231ed..000000000
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks/FundamentalMatrixEstimation.cpp
+++ /dev/null
@@ -1,312 +0,0 @@
-// ========================================================================== //
-// This file is part of Sara, a basic set of libraries in C++ for computer
-// vision.
-//
-// Copyright (C) 2019 David Ok <david.ok8@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla Public
-// License v. 2.0. If a copy of the MPL was not distributed with this file,
-// you can obtain one at http://mozilla.org/MPL/2.0/.
-// ========================================================================== //
-
-#include <DO/Sara/Core/HDF5.hpp>
-#include <DO/Sara/Core/StringFormat.hpp>
-#include <DO/Sara/FileSystem.hpp>
-#include <DO/Sara/MultiViewGeometry.hpp>
-#include <DO/Sara/RANSAC/RANSACv2.hpp>
-#include <DO/Sara/SfM/BuildingBlocks/FundamentalMatrixEstimation.hpp>
-#include <DO/Sara/Visualization.hpp>
-
-#include <boost/filesystem.hpp>
-
-
-namespace fs = boost::filesystem;
-
-
-namespace DO::Sara {
-
-  using FSolver = SevenPointAlgorithmDoublePrecision;
-
-  auto estimate_fundamental_matrix(const std::vector<Match>& Mij,
-                                   const KeypointList<OERegion, float>& ki,
-                                   const KeypointList<OERegion, float>& kj,
-                                   const int num_samples,
-                                   const double err_thres)
-      -> std::tuple<FundamentalMatrix, Tensor_<bool, 1>, Tensor_<int, 1>>
-  {
-    const auto& fi = features(ki);
-    const auto& fj = features(kj);
-    const auto pi = homogeneous(extract_centers(fi).cast<double>());
-    const auto pj = homogeneous(extract_centers(fj).cast<double>());
-    const auto Mij_tensor = to_tensor(Mij);
-
-    const auto Xij = PointCorrespondenceList{Mij_tensor, pi, pj};
-    const auto data_normalizer =
-        std::make_optional(Normalizer<FundamentalMatrix>{Xij});
-
-    auto inlier_predicate = InlierPredicate<SampsonEpipolarDistance>{};
-    inlier_predicate.err_threshold = err_thres;
-
-    static constexpr auto confidence = 0.99;
-    const auto [F, inliers, sample_best] = v2::ransac(  //
-        Xij,                                            //
-        FSolver{},                                      //
-        inlier_predicate,                               //
-        num_samples, confidence,                        //
-        data_normalizer,                                //
-        true);
-
-    return std::make_tuple(F, inliers, sample_best);
-  }
-
-  auto estimate_fundamental_matrices(const std::string& dirpath,
-                                     const std::string& h5_filepath,
-                                     bool overwrite, bool debug, bool wait_key)
-      -> void
-  {
-    // Create a backup.
-    if (!fs::exists(h5_filepath + ".bak"))
-      cp(h5_filepath, h5_filepath + ".bak");
-
-    SARA_DEBUG << "Opening file " << h5_filepath << "..." << std::endl;
-    auto h5_file = H5File{h5_filepath, H5F_ACC_RDWR};
-
-    auto view_attributes = ViewAttributes{};
-
-    // Load images (optional).
-    SARA_DEBUG << "Listing images from:\n\t" << dirpath << std::endl;
-    view_attributes.list_images(dirpath);
-    if (debug)
-      view_attributes.read_images();
-
-    // Load keypoints (optional).
-    SARA_DEBUG << "Reading keypoints from HDF5 file:\n\t" << h5_filepath
-               << std::endl;
-    view_attributes.read_keypoints(h5_file);
-    const auto& keypoints = view_attributes.keypoints;
-
-    // Initialize the epipolar graph.
-    const auto num_vertices = int(view_attributes.image_paths.size());
-    SARA_CHECK(num_vertices);
-
-    auto edge_attributes = EpipolarEdgeAttributes{};
-    SARA_DEBUG << "Initializing the epipolar edges..." << std::endl;
-    edge_attributes.initialize_edges(num_vertices);
-
-    SARA_DEBUG << "Reading matches from HDF5 file:\n\t" << h5_filepath
-               << std::endl;
-    edge_attributes.read_matches(h5_file, view_attributes);
-
-    SARA_DEBUG << "Preallocate the F data structures..." << std::endl;
-    edge_attributes.resize_fundamental_edge_list();
-
-    const auto& edge_ids = edge_attributes.edge_ids;
-    const auto& edges = edge_attributes.edges;
-    const auto& matches = edge_attributes.matches;
-    SARA_CHECK(edge_ids.size());
-    SARA_CHECK(edges.size());
-    SARA_CHECK(matches.size());
-
-
-    // Mutate these.
-    auto& F = edge_attributes.F;
-    auto& F_num_samples = edge_attributes.F_num_samples;
-    auto& F_noise = edge_attributes.F_noise;
-    auto& F_inliers = edge_attributes.F_inliers;
-    auto& F_best_samples = edge_attributes.F_best_samples;
-
-    const auto num_samples = 1000;
-    const auto f_err_thres = 5e-3;
-    std::for_each(
-        std::begin(edge_ids), std::end(edge_ids), [&](const auto& ij) {
-          const auto& eij = edges[ij];
-          const auto i = eij.first;
-          const auto j = eij.second;
-          const auto& Mij = matches[ij];
-          const auto& ki = keypoints[i];
-          const auto& kj = keypoints[j];
-
-          SARA_DEBUG << "Calculating fundamental matrices between images:\n"
-                     << "- image[" << i << "] = "  //
-                     << view_attributes.group_names[i] << "\n"
-                     << "- image[" << j << "] = "  //
-                     << view_attributes.group_names[j] << "\n";
-          std::cout.flush();
-
-          // Estimate the fundamental matrix.
-          const auto [Fij, F_inliers_ij, F_best_sample_ij] =
-              estimate_fundamental_matrix(Mij, ki, kj, num_samples,
-                                          f_err_thres);
-          SARA_DEBUG << "Fij = \n" << Fij << std::endl;
-          SARA_CHECK(F_inliers_ij.row_vector());
-          SARA_CHECK(F_best_sample_ij.row_vector());
-
-          if (debug)
-          {
-            const int display_step = 20;
-            const auto& Ii = view_attributes.images[i];
-            const auto& Ij = view_attributes.images[j];
-            check_epipolar_constraints(Ii, Ij, Fij, Mij, F_best_sample_ij,
-                                       F_inliers_ij, display_step, wait_key);
-          }
-
-          // Update.
-          F[ij] = Fij;
-          F_inliers[ij] = F_inliers_ij;
-          F_best_samples[ij] = F_best_sample_ij;
-          F_noise[ij] = f_err_thres;
-        });
-
-    // Save fundamental matrices and additional info from RANSAC.
-    h5_file.write_dataset("F", tensor_view(F), overwrite);
-    h5_file.write_dataset("F_num_samples", tensor_view(F_num_samples),
-                          overwrite);
-    h5_file.write_dataset("F_noise", tensor_view(F_noise), overwrite);
-    h5_file.write_dataset("F_best_samples", F_best_samples, overwrite);
-
-    h5_file.get_group("F_inliers");
-    std::for_each(std::begin(edge_ids), std::end(edge_ids),
-                  [&](const auto& ij) {
-                    const auto i = edges[ij].first;
-                    const auto j = edges[ij].second;
-                    h5_file.write_dataset(format("F_inliers/%d_%d", i, j),
-                                          F_inliers[ij], overwrite);
-                  });
-  }
-
-  auto check_epipolar_constraints(const Image<Rgb8>& Ii, const Image<Rgb8>& Ij,
-                                  const FundamentalMatrix& F,
-                                  const std::vector<Match>& Mij,
-                                  const TensorView_<int, 1>& sample_best,
-                                  const TensorView_<bool, 1>& inliers,
-                                  int display_step, bool wait_key) -> void
-  {
-    const auto scale = 0.25f;
-    const auto w = int((Ii.width() + Ij.width()) * scale + 0.5f);
-    const auto h = int(std::max(Ii.height(), Ij.height()) * scale + 0.5f);
-
-    if (!active_window())
-    {
-      create_window(w, h);
-      set_antialiasing();
-    }
-
-    if (get_sizes(active_window()) != Eigen::Vector2i(w, h))
-      resize_window(w, h);
-
-    PairWiseDrawer drawer(Ii, Ij);
-    drawer.set_viz_params(scale, scale, PairWiseDrawer::CatH);
-
-    drawer.display_images();
-
-    for (auto m = 0; m < static_cast<int>(Mij.size()); ++m)
-    {
-      const Eigen::Vector3d X1 = Mij[m].x_pos().cast<double>().homogeneous();
-      const Eigen::Vector3d X2 = Mij[m].y_pos().cast<double>().homogeneous();
-
-      if (!inliers(m))
-        continue;
-
-      if (m % display_step == 0)
-      {
-        drawer.draw_match(Mij[m], Blue8, false);
-
-        const auto proj_X1 = F.right_epipolar_line(X1);
-        const auto proj_X2 = F.left_epipolar_line(X2);
-
-        drawer.draw_line_from_eqn(0, proj_X2.cast<float>(), Cyan8, 1);
-        drawer.draw_line_from_eqn(1, proj_X1.cast<float>(), Cyan8, 1);
-      }
-    }
-
-    for (auto m = 0; m < static_cast<int>(sample_best.size()); ++m)
-    {
-      // Draw the best elemental subset drawn by RANSAC.
-      drawer.draw_match(Mij[sample_best(m)], Red8, true);
-
-      const Eigen::Vector3d X1 =
-          Mij[sample_best(m)].x_pos().cast<double>().homogeneous();
-      const Eigen::Vector3d X2 =
-          Mij[sample_best(m)].y_pos().cast<double>().homogeneous();
-
-      const auto proj_X1 = F.right_epipolar_line(X1);
-      const auto proj_X2 = F.left_epipolar_line(X2);
-
-      // Draw the corresponding epipolar lines.
-      drawer.draw_line_from_eqn(1, proj_X1.cast<float>(), Magenta8, 1);
-      drawer.draw_line_from_eqn(0, proj_X2.cast<float>(), Magenta8, 1);
-    }
-
-    if (wait_key)
-      get_key();
-  }
-
-  auto inspect_fundamental_matrices(const std::string& dirpath,
-                                    const std::string& h5_filepath,
-                                    int display_step, bool wait_key) -> void
-  {
-    SARA_DEBUG << "Opening file " << h5_filepath << "..." << std::endl;
-    auto h5_file = H5File{h5_filepath, H5F_ACC_RDONLY};
-
-    auto view_attributes = ViewAttributes{};
-
-    // Load images (optional).
-    SARA_DEBUG << "Listing images from:\n\t" << dirpath << std::endl;
-    view_attributes.list_images(dirpath);
-    view_attributes.read_images();
-
-    // Load keypoints (optional).
-    SARA_DEBUG << "Reading keypoints from HDF5 file:\n\t" << h5_filepath
-               << std::endl;
-    view_attributes.read_keypoints(h5_file);
-    const auto& images = view_attributes.images;
-
-    // Initialize the epipolar graph.
-    const auto num_vertices = int(view_attributes.image_paths.size());
-    SARA_CHECK(num_vertices);
-
-    auto edge_attributes = EpipolarEdgeAttributes{};
-    SARA_DEBUG << "Initializing the epipolar edges..." << std::endl;
-    edge_attributes.initialize_edges(num_vertices);
-
-    SARA_DEBUG << "Reading matches from HDF5 file:\n\t" << h5_filepath
-               << std::endl;
-    edge_attributes.read_matches(h5_file, view_attributes);
-
-    SARA_DEBUG << "Reading the fundamental matrices..." << std::endl;
-    edge_attributes.resize_fundamental_edge_list();
-    edge_attributes.read_fundamental_matrices(view_attributes, h5_file);
-
-    // Convenient references.
-    const auto& edge_ids = edge_attributes.edge_ids;
-    const auto& edges = edge_attributes.edges;
-    const auto& matches = edge_attributes.matches;
-
-    const auto& F = edge_attributes.F;
-    const auto& F_num_samples = edge_attributes.F_num_samples;
-    const auto& F_noise = edge_attributes.F_noise;
-    const auto& F_best_samples = edge_attributes.F_best_samples;
-    const auto& F_inliers = edge_attributes.F_inliers;
-
-    std::for_each(
-        std::begin(edge_ids), std::end(edge_ids), [&](const auto& ij) {
-          const auto& eij = edges[ij];
-          const auto i = eij.first;
-          const auto j = eij.second;
-          const auto& Mij = matches[ij];
-
-          SARA_DEBUG << "Fij = \n" << F[ij] << std::endl;
-          SARA_CHECK(F_num_samples[ij]);
-          SARA_CHECK(F_noise[ij]);
-          SARA_CHECK(F_inliers[ij].row_vector());
-          SARA_CHECK(F_inliers[ij].row_vector().count());
-          SARA_CHECK(F_best_samples[ij].row_vector());
-
-          const auto& Ii = images[i];
-          const auto& Ij = images[j];
-          check_epipolar_constraints(Ii, Ij, F[ij], Mij, F_best_samples[ij],
-                                     F_inliers[ij], display_step, wait_key);
-        });
-  }
-
-} /* namespace DO::Sara */
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/KeypointDetection.cpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/KeypointDetection.cpp
deleted file mode 100644
index 3bf50c42b..000000000
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks/KeypointDetection.cpp
+++ /dev/null
@@ -1,91 +0,0 @@
-// ========================================================================== //
-// This file is part of Sara, a basic set of libraries in C++ for computer
-// vision.
-//
-// Copyright (C) 2019 David Ok <david.ok8@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla Public
-// License v. 2.0. If a copy of the MPL was not distributed with this file,
-// you can obtain one at http://mozilla.org/MPL/2.0/.
-// ========================================================================== //
-
-#include <DO/Sara/SfM/BuildingBlocks/KeypointDetection.hpp>
-
-#include <DO/Sara/FeatureDetectors.hpp>
-#include <DO/Sara/FileSystem.hpp>
-#include <DO/Sara/ImageIO.hpp>
-#include <DO/Sara/Visualization.hpp>
-
-
-namespace DO::Sara {
-
-  auto detect_keypoints(const std::string& dirpath,
-                        const std::string& h5_filepath,  //
-                        bool overwrite) -> void
-  {
-    auto h5_file = H5File{h5_filepath, H5F_ACC_TRUNC};
-
-    auto image_paths = std::vector<std::string>{};
-    append(image_paths, ls(dirpath, ".png"));
-    append(image_paths, ls(dirpath, ".jpg"));
-
-    std::for_each(
-        std::begin(image_paths), std::end(image_paths), [&](const auto& path) {
-          SARA_DEBUG << "Reading image " << path << "..." << std::endl;
-          const auto image = imread<float>(path);
-
-          SARA_DEBUG << "Computing SIFT keypoints " << path << "..."
-                     << std::endl;
-          const auto keys = compute_sift_keypoints(image);
-
-          const auto group_name = basename(path);
-          h5_file.get_group(group_name);
-
-          SARA_DEBUG << "Saving SIFT keypoints of " << path << "..."
-                     << std::endl;
-          write_keypoints(h5_file, group_name, keys, overwrite);
-        });
-  }
-
-
-  auto read_keypoints(const std::string& dirpath,
-                      const std::string& h5_filepath) -> void
-  {
-    auto h5_file = H5File{h5_filepath, H5F_ACC_RDONLY};
-    auto image_paths = std::vector<std::string>{};
-    append(image_paths, ls(dirpath, ".png"));
-    append(image_paths, ls(dirpath, ".jpg"));
-
-    std::for_each(
-        std::begin(image_paths), std::end(image_paths), [&](const auto& path) {
-          SARA_DEBUG << "Reading image " << path << "..." << std::endl;
-          const auto image = imread<float>(path);
-
-          const auto group_name = basename(path);
-
-          SARA_DEBUG << "Read keypoints for " << group_name << "..."
-                     << std::endl;
-          const auto keys = read_keypoints(h5_file, group_name);
-
-          const auto& features = std::get<0>(keys);
-
-          // Visual inspection.
-          if (!active_window())
-          {
-            create_window(image.sizes() / 2, group_name);
-            set_antialiasing();
-          }
-
-          if (get_sizes(active_window()) != image.sizes() / 2)
-            resize_window(image.sizes() / 2);
-
-          display(image, Point2i::Zero(), 0.5);
-          draw_oe_regions(features, Red8, 0.5f);
-          get_key();
-        });
-
-    if (active_window())
-      close_window();
-  }
-
-} /* namespace DO::Sara */
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/KeypointMatching.cpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/KeypointMatching.cpp
deleted file mode 100644
index bc08bc368..000000000
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks/KeypointMatching.cpp
+++ /dev/null
@@ -1,109 +0,0 @@
-// ========================================================================== //
-// This file is part of Sara, a basic set of libraries in C++ for computer
-// vision.
-//
-// Copyright (C) 2019 David Ok <david.ok8@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla Public
-// License v. 2.0. If a copy of the MPL was not distributed with this file,
-// you can obtain one at http://mozilla.org/MPL/2.0/.
-// ========================================================================== //
-
-#include <DO/Sara/FeatureMatching.hpp>
-#include <DO/Sara/FileSystem.hpp>
-#include <DO/Sara/Match.hpp>
-#include <DO/Sara/SfM/BuildingBlocks/KeypointMatching.hpp>
-
-#include <boost/filesystem.hpp>
-
-
-namespace fs = boost::filesystem;
-
-
-namespace DO::Sara {
-
-auto match(const KeypointList<OERegion, float>& keys1,
-           const KeypointList<OERegion, float>& keys2,
-           float lowe_ratio)
-    -> std::vector<Match>
-{
-  AnnMatcher matcher{keys1, keys2, lowe_ratio};
-  return matcher.compute_matches();
-}
-
-
-auto match_keypoints(const std::string& dirpath, const std::string& h5_filepath,
-                     bool overwrite) -> void
-{
-  // Create a backup.
-  if (!fs::exists(h5_filepath + ".bak"))
-    cp(h5_filepath, h5_filepath + ".bak");
-
-  auto h5_file = H5File{h5_filepath, H5F_ACC_RDWR};
-
-  auto image_paths = std::vector<std::string>{};
-  append(image_paths, ls(dirpath, ".png"));
-  append(image_paths, ls(dirpath, ".jpg"));
-  std::sort(image_paths.begin(), image_paths.end());
-
-  auto group_names = std::vector<std::string>{};
-  group_names.reserve(image_paths.size());
-  std::transform(std::begin(image_paths), std::end(image_paths),
-                 std::back_inserter(group_names),
-                 [&](const std::string& image_path) {
-                   return basename(image_path);
-                 });
-
-  auto keypoints = std::vector<KeypointList<OERegion, float>>{};
-  keypoints.reserve(image_paths.size());
-  std::transform(std::begin(group_names), std::end(group_names),
-                 std::back_inserter(keypoints),
-                 [&](const std::string& group_name) {
-                   return read_keypoints(h5_file, group_name);
-                 });
-
-  const auto N = int(image_paths.size());
-  auto edges = std::vector<std::pair<int, int>>{};
-  edges.reserve(N * (N - 1) / 2);
-  for (int i = 0; i < N; ++i)
-    for (int j = i + 1; j < N; ++j)
-      edges.emplace_back(i, j);
-
-  auto matches = std::vector<std::vector<Match>>{};
-  matches.reserve(edges.size());
-  std::transform(std::begin(edges), std::end(edges),
-                 std::back_inserter(matches),
-                 [&](const auto& edge) {
-                   const auto i = edge.first;
-                   const auto j = edge.second;
-                   return match(keypoints[i], keypoints[j]);
-                 });
-
-  // Save matches to HDF5.
-  auto edge_ids = range(edges.size());
-  std::for_each(
-      std::begin(edge_ids), std::end(edge_ids), [&](const auto& e) {
-        const auto& ij = edges[e];
-        const auto i = ij.first;
-        const auto j = ij.second;
-        const auto& matches_ij = matches[e];
-
-        // Transform the data.
-        auto Mij = std::vector<IndexMatch>{};
-        std::transform(
-            std::begin(matches_ij), std::end(matches_ij),
-            std::back_inserter(Mij), [](const auto& m) {
-              return IndexMatch{m.x_index(), m.y_index(), m.score()};
-            });
-
-        // Save the keypoints to HDF5
-        const auto group_name = std::string{"matches"};
-        h5_file.get_group(group_name);
-
-        const auto match_dataset =
-            group_name + "/" + std::to_string(i) + "_" + std::to_string(j);
-        h5_file.write_dataset(match_dataset, tensor_view(Mij), overwrite);
-      });
-}
-
-} /* namespace DO::Sara */
diff --git a/cpp/src/DO/Sara/SfM/Graph/PointCloudManipulator.hpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudManipulator.hpp
similarity index 95%
rename from cpp/src/DO/Sara/SfM/Graph/PointCloudManipulator.hpp
rename to cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudManipulator.hpp
index 69e91dcd6..139dece09 100644
--- a/cpp/src/DO/Sara/SfM/Graph/PointCloudManipulator.hpp
+++ b/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudManipulator.hpp
@@ -1,8 +1,8 @@
 #pragma once
 
+#include <DO/Sara/SfM/BuildingBlocks/RgbColoredPoint.hpp>
 #include <DO/Sara/SfM/Graph/CameraPoseGraph.hpp>
 #include <DO/Sara/SfM/Graph/FeatureGraph.hpp>
-#include <DO/Sara/SfM/Graph/RgbColoredPoint.hpp>
 
 
 namespace DO::Sara {
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/RelativePoseEstimator.cpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/RelativePoseEstimator.cpp
new file mode 100644
index 000000000..4c7be664e
--- /dev/null
+++ b/cpp/src/DO/Sara/SfM/BuildingBlocks/RelativePoseEstimator.cpp
@@ -0,0 +1,59 @@
+#include <DO/Sara/SfM/BuildingBlocks/RelativePoseEstimator.hpp>
+
+#include <DO/Sara/Logging/Logger.hpp>
+
+#include <DO/Sara/MultiViewGeometry/DataTransformations.hpp>
+#include <DO/Sara/MultiViewGeometry/MinimalSolvers/InlierPredicates.hpp>
+#include <DO/Sara/MultiViewGeometry/MinimalSolvers/RelativePoseSolver.hpp>
+
+using namespace DO::Sara::v2;
+
+RelativePoseEstimator::RelativePoseEstimator(
+    const v2::BrownConradyDistortionModel<double>& camera)
+{
+  configure(camera);
+}
+
+auto RelativePoseEstimator::configure(
+    const v2::BrownConradyDistortionModel<double>& camera) -> void
+{
+  _K = camera.calibration_matrix();
+  _K_inv = _K.inverse();
+  // Both views use the same camera, so K1_inv and K2_inv are the same matrix.
+  _inlier_predicate.distance.K1_inv = _K_inv;
+  _inlier_predicate.distance.K2_inv = _K_inv;
+  _inlier_predicate.err_threshold = err_thres;  // copied at call time
+}
+
+auto RelativePoseEstimator::estimate_relative_pose(
+    const KeypointList<OERegion, float>& src_keys,
+    const KeypointList<OERegion, float>& dst_keys,
+    std::vector<Match>& matches) const
+    -> std::tuple<TwoViewGeometry, Tensor_<bool, 1>, Tensor_<int, 1>>
+{
+  auto& logger = Logger::get();  // bind by reference: do not copy the logger
+
+  SARA_LOGD(logger, "Estimating the relative pose...");
+  if (matches.empty())
+  {
+    SARA_LOGD(logger, "Skipping relative pose estimation");
+    return {};  // value-initialized geometry, inlier mask and best sample
+  }
+
+  const auto& f0 = features(src_keys);
+  const auto& f1 = features(dst_keys);
+  const auto u = std::array{
+      homogeneous(extract_centers(f0)).cast<double>(),
+      homogeneous(extract_centers(f1)).cast<double>()  //
+  };
+  // List the matches as a 2D-tensor where each row encodes a match 'm' as a
+  // pair of point indices (i, j).
+  const auto M = to_tensor(matches);
+
+  const auto X = PointCorrespondenceList{M, u[0], u[1]};
+  auto data_normalizer =
+      std::make_optional(Normalizer<TwoViewGeometry>{_K, _K});
+
+  return v2::ransac(X, _solver, _inlier_predicate, ransac_iterations_max,
+                    ransac_confidence, data_normalizer, true);
+}
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/RelativePoseEstimator.hpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/RelativePoseEstimator.hpp
new file mode 100644
index 000000000..914ad0b55
--- /dev/null
+++ b/cpp/src/DO/Sara/SfM/BuildingBlocks/RelativePoseEstimator.hpp
@@ -0,0 +1,41 @@
+#pragma once
+
+#include <DO/Sara/Features/KeypointList.hpp>
+
+#include <DO/Sara/MultiViewGeometry/Camera/v2/BrownConradyCamera.hpp>
+#include <DO/Sara/MultiViewGeometry/DataTransformations.hpp>
+#include <DO/Sara/MultiViewGeometry/MinimalSolvers/InlierPredicates.hpp>
+#include <DO/Sara/MultiViewGeometry/MinimalSolvers/RelativePoseSolver.hpp>
+
+#include <DO/Sara/RANSAC/RANSACv2.hpp>
+
+
+namespace DO::Sara::v2 {
+
+  struct RelativePoseEstimator
+  {
+    int ransac_iterations_max = 1000;
+    double ransac_confidence = 0.999;
+    double err_thres = 4.;
+
+    // Use Stewenius' algorithm instead of Nister's for now. The polynomial
+    // solver must have some convergence problems.
+    const RelativePoseSolver<SteweniusFivePointAlgorithm> _solver;
+    CheiralAndEpipolarConsistency _inlier_predicate;
+
+    Eigen::Matrix3d _K;
+    Eigen::Matrix3d _K_inv;
+
+    RelativePoseEstimator(
+        const v2::BrownConradyDistortionModel<double>& camera);
+
+    auto configure(const v2::BrownConradyDistortionModel<double>& camera)
+        -> void;
+
+    auto estimate_relative_pose(const KeypointList<OERegion, float>& src_keys,
+                                const KeypointList<OERegion, float>& dst_keys,
+                                std::vector<Match>& matches) const
+        -> std::tuple<TwoViewGeometry, Tensor_<bool, 1>, Tensor_<int, 1>>;
+  };
+
+}  // namespace DO::Sara::v2
diff --git a/cpp/src/DO/Sara/SfM/Graph/RgbColoredPoint.hpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/RgbColoredPoint.hpp
similarity index 100%
rename from cpp/src/DO/Sara/SfM/Graph/RgbColoredPoint.hpp
rename to cpp/src/DO/Sara/SfM/BuildingBlocks/RgbColoredPoint.hpp
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/v2/FeatureTracker.hpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/v2/FeatureTracker.hpp
deleted file mode 100644
index fdac7975e..000000000
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks/v2/FeatureTracker.hpp
+++ /dev/null
@@ -1,39 +0,0 @@
-#pragma once
-
-#include <DO/Sara/Core/Image.hpp>
-#include <DO/Sara/Features/Feature.hpp>
-
-#include <DO/Sara/FeatureDetectors/SIFT.hpp>
-#include <DO/Sara/SfM/BuildingBlocks/KeypointMatching.hpp>
-
-
-namespace DO::Sara::v2 {
-
-  struct FeatureTracker
-  {
-    ImagePyramidParams image_pyr_params = ImagePyramidParams(0);
-    float sift_nn_ratio = 0.6f;
-    std::size_t num_matches_max = 1000u;
-
-    auto detect_features(const ImageView<float>& image,
-                         KeypointList<OERegion, float>& keypoints) const -> void
-    {
-      keypoints = compute_sift_keypoints(image, image_pyr_params);
-    }
-
-    auto match_features(const KeypointList<OERegion, float>& src_keys,
-                        const KeypointList<OERegion, float>& dst_keys) const
-        -> std::vector<Match>
-    {
-      if (features(src_keys).empty() || features(dst_keys).empty())
-        return {};
-
-      auto matches = match(src_keys, dst_keys, sift_nn_ratio);
-      if (matches.size() > num_matches_max)
-        matches.resize(num_matches_max);
-
-      return matches;
-    }
-  };
-
-}  // namespace DO::Sara::v2
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/v2/RelativePoseEstimator.hpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/v2/RelativePoseEstimator.hpp
deleted file mode 100644
index ff3829a3a..000000000
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks/v2/RelativePoseEstimator.hpp
+++ /dev/null
@@ -1,80 +0,0 @@
-#pragma once
-
-#include <DO/Sara/FeatureDetectors/SIFT.hpp>
-#include <DO/Sara/Features/KeypointList.hpp>
-
-#include <DO/Sara/MultiViewGeometry/Camera/v2/BrownConradyCamera.hpp>
-#include <DO/Sara/MultiViewGeometry/DataTransformations.hpp>
-#include <DO/Sara/MultiViewGeometry/MinimalSolvers/ErrorMeasures.hpp>
-#include <DO/Sara/MultiViewGeometry/MinimalSolvers/InlierPredicates.hpp>
-#include <DO/Sara/MultiViewGeometry/MinimalSolvers/RelativePoseSolver.hpp>
-
-#include <DO/Sara/RANSAC/RANSACv2.hpp>
-
-#include <DO/Sara/SfM/BuildingBlocks/KeypointMatching.hpp>
-
-
-namespace DO::Sara::v2 {
-
-  struct RelativePoseEstimator
-  {
-    int ransac_iterations_max = 1000;
-    double ransac_confidence = 0.999;
-    double err_thres = 4.;
-
-    // Use Stewenius' algorithm instead of Nister's for now. The polynomial
-    // solver must have some convergence problems.
-    const RelativePoseSolver<SteweniusFivePointAlgorithm> _solver;
-    CheiralAndEpipolarConsistency _inlier_predicate;
-
-    Eigen::Matrix3d _K;
-    Eigen::Matrix3d _K_inv;
-
-    RelativePoseEstimator(const v2::BrownConradyDistortionModel<double>& camera)
-    {
-      configure(camera);
-    }
-
-    auto configure(const v2::BrownConradyDistortionModel<double>& camera)
-        -> void
-    {
-      _K = camera.calibration_matrix();
-      _K_inv = _K.inverse();
-
-      _inlier_predicate.distance.K1_inv = _K_inv;
-      _inlier_predicate.distance.K2_inv = _K_inv;
-      _inlier_predicate.err_threshold = err_thres;
-    }
-
-    auto estimate_relative_pose(const KeypointList<OERegion, float>& src_keys,
-                                const KeypointList<OERegion, float>& dst_keys,
-                                std::vector<Match>& matches) const
-        -> std::tuple<TwoViewGeometry, Tensor_<bool, 1>, Tensor_<int, 1>>
-    {
-      print_stage("Estimating the relative pose...");
-      if (matches.empty())
-      {
-        SARA_DEBUG << "Skipping relative pose estimation\n";
-        return {};
-      }
-
-      const auto& f0 = features(src_keys);
-      const auto& f1 = features(dst_keys);
-      const auto u = std::array{
-          homogeneous(extract_centers(f0)).cast<double>(),
-          homogeneous(extract_centers(f1)).cast<double>()  //
-      };
-      // List the matches as a 2D-tensor where each row encodes a match 'm' as a
-      // pair of point indices (i, j).
-      const auto M = to_tensor(matches);
-
-      const auto X = PointCorrespondenceList{M, u[0], u[1]};
-      auto data_normalizer =
-          std::make_optional(Normalizer<TwoViewGeometry>{_K, _K});
-
-      return v2::ransac(X, _solver, _inlier_predicate, ransac_iterations_max,
-                        ransac_confidence, data_normalizer, true);
-    }
-  };
-
-}  // namespace DO::Sara::v2
diff --git a/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.cpp b/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.cpp
index a77edd418..0a551fd31 100644
--- a/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.cpp
+++ b/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.cpp
@@ -9,41 +9,42 @@
 // you can obtain one at http://mozilla.org/MPL/2.0/.
 // ========================================================================== //
 
-
 #include <DO/Sara/SfM/Graph/CameraPoseGraph.hpp>
 
 #include <DO/Sara/FeatureDetectors/SIFT.hpp>
 #include <DO/Sara/Features/KeypointList.hpp>
 #include <DO/Sara/Logging/Logger.hpp>
+#include <DO/Sara/SfM/Helpers/KeypointMatching.hpp>
 
 
 using namespace DO::Sara;
 
-auto CameraPoseGraph::detect_keypoints(
-    const v2::FeatureTracker& feature_tracker,
-    const ImageView<float>& image,  //
-    const int frame_index) -> void
 
+auto CameraPoseGraph::add_absolute_pose(
+    KeypointList<OERegion, float>&& keypoints,  //
+    const int image_id) -> CameraPoseGraph::Vertex
 {
   auto& logger = Logger::get();
 
-  SARA_LOGI(logger, "Detecting keypoints for image frame {}", frame_index);
+  SARA_LOGI(logger, "Detecting keypoints for image frame {}", image_id);
 
   // Grow the pose graph by creating a new camera vertex.
   const auto v = boost::add_vertex(_g);
 
   // Store the camera pose data.
-  auto& camera_pose_data = _g[v];
-  camera_pose_data.frame_index = frame_index;
-  camera_pose_data.keypoints = compute_sift_keypoints(image);
+  auto& pose_data = _g[v];
+  pose_data.image_id = image_id;
+  pose_data.keypoints = std::move(keypoints);
+
+  const auto& f = features(pose_data.keypoints);
+  SARA_LOGI(logger, "Camera {}: {} keypoints", v, f.size());
 
-  const auto& f = features(camera_pose_data.keypoints);
-  SARA_LOGI(logger, "Camera vertex: {} keypoints", f.size());
+  return v;
 }
 
-auto CameraPoseGraph::estimate_relative_motion(
-    const v2::FeatureTracker& feature_tracker,                 //
+auto CameraPoseGraph::add_relative_pose(
     const v2::RelativePoseEstimator& relative_pose_estimator,  //
+    const FeatureParams& feature_params,                       //
     const Vertex u, const Vertex v) -> void
 {
   auto& logger = Logger::get();
@@ -51,9 +52,14 @@ auto CameraPoseGraph::estimate_relative_motion(
   SARA_LOGI(logger, "Match features...");
   const auto& src_keys = _g[u].keypoints;
   const auto& dst_keys = _g[v].keypoints;
-  auto matches = feature_tracker.match_features(src_keys, dst_keys);
+  if (features(src_keys).empty() || features(dst_keys).empty())
+    return;
+
+  auto matches = match(src_keys, dst_keys, feature_params.sift_nn_ratio);
   if (matches.empty())
     return;
+  if (matches.size() > feature_params.num_matches_max)
+    matches.resize(feature_params.num_matches_max);
 
   SARA_LOGI(logger, "Estimating relative pose...");
   auto [geometry, inliers, sample_best] =
@@ -71,7 +77,5 @@ auto CameraPoseGraph::estimate_relative_motion(
     auto& relative_motion_data = _g[e];
     relative_motion_data.matches = std::move(matches);
     relative_motion_data.inliers = std::move(inliers);
-    relative_motion_data.src_camera = u;
-    relative_motion_data.dst_camera = v;
   }
 }
diff --git a/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp b/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp
index aa7b4f99b..701ef40a5 100644
--- a/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp
+++ b/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp
@@ -17,63 +17,87 @@
 #include <DO/Sara/MultiViewGeometry/Geometry/EssentialMatrix.hpp>
 #include <DO/Sara/MultiViewGeometry/Geometry/QuaternionBasedPose.hpp>
 
-#include <DO/Sara/SfM/BuildingBlocks/v2/FeatureTracker.hpp>
-#include <DO/Sara/SfM/BuildingBlocks/v2/RelativePoseEstimator.hpp>
+#include <DO/Sara/SfM/BuildingBlocks/FeatureParams.hpp>
+#include <DO/Sara/SfM/BuildingBlocks/RelativePoseEstimator.hpp>
 
 #include <boost/graph/adjacency_list.hpp>
 
 
 namespace DO::Sara {
 
-  struct CameraPoseData
+  struct AbsolutePoseData
   {
-    //! @brief The corresponding image frame index.
-    int frame_index;
-
+    //! @brief The corresponding image frame index or image ID.
+    int image_id;
     //! @brief The keypoints detected in the image.
     KeypointList<OERegion, float> keypoints;
-
     //! @brief "Absolute" pose w.r.t. some reference frame.
     QuaternionBasedPose<double> pose;
   };
 
-  struct RelativeMotionData
+  struct RelativePoseData
   {
-    using camera_id_t = int;
-    static constexpr auto undefined_camera_id = -1;
-
-    camera_id_t src_camera = undefined_camera_id;
-    camera_id_t dst_camera = undefined_camera_id;
-
     std::vector<Match> matches;
     Tensor_<bool, 1> inliers;
-
     Motion motion;
   };
 
   class CameraPoseGraph
   {
   public:
-    using GraphImpl =
-        boost::adjacency_list<boost::vecS, boost::vecS, boost::undirectedS,
-                              CameraPoseData, RelativeMotionData>;
-
-  public:
-    using Vertex = boost::graph_traits<GraphImpl>::vertex_descriptor;
-    using Edge = boost::graph_traits<GraphImpl>::edge_descriptor;
-
-    auto detect_keypoints(const v2::FeatureTracker& feature_tracker,
-                          const ImageView<float>& image,  //
-                          const int frame_index) -> void;
-
-    auto estimate_relative_motion(
-        const v2::FeatureTracker& feature_tracker,                 //
+    using Impl = boost::adjacency_list<                //
+        boost::vecS, boost::vecS, boost::undirectedS,  //
+        AbsolutePoseData, RelativePoseData>;
+    using Vertex = boost::graph_traits<Impl>::vertex_descriptor;
+    using VertexIndex = boost::graph_traits<Impl>::vertices_size_type;
+    using Edge = boost::graph_traits<Impl>::edge_descriptor;
+
+    operator Impl&()
+    {
+      return _g;
+    }
+
+    operator const Impl&() const
+    {
+      return _g;
+    }
+
+    auto operator[](const Vertex u) -> AbsolutePoseData&
+    {
+      return _g[u];
+    }
+
+    auto operator[](const Vertex u) const -> const AbsolutePoseData&
+    {
+      return _g[u];
+    }
+
+    auto operator[](const Edge e) -> RelativePoseData&
+    {
+      return _g[e];
+    }
+
+    auto operator[](const Edge e) const -> const RelativePoseData&
+    {
+      return _g[e];
+    }
+
+    auto num_vertices() const -> VertexIndex
+    {
+      return boost::num_vertices(_g);
+    }
+
+    auto add_absolute_pose(KeypointList<OERegion, float>&& keypoints,
+                           const int image_id) -> Vertex;
+
+    auto add_relative_pose(
         const v2::RelativePoseEstimator& relative_pose_estimator,  //
+        const FeatureParams& feature_params,                       //
         const Vertex src, const Vertex dst) -> void;
 
   private:
     //! @brief The graph data structure shortened as g.
-    GraphImpl _g;
+    Impl _g;
   };
 
 } /* namespace DO::Sara */
diff --git a/cpp/src/DO/Sara/SfM/Graph/FeatureDisjointSets.hpp b/cpp/src/DO/Sara/SfM/Graph/FeatureDisjointSets.hpp
new file mode 100644
index 000000000..998b2a209
--- /dev/null
+++ b/cpp/src/DO/Sara/SfM/Graph/FeatureDisjointSets.hpp
@@ -0,0 +1,57 @@
+// ========================================================================== //
+// This file is part of Sara, a basic set of libraries in C++ for computer
+// vision.
+//
+// Copyright (C) 2024-present David Ok <david.ok8@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License v. 2.0. If a copy of the MPL was not distributed with this file,
+// you can obtain one at http://mozilla.org/MPL/2.0/.
+// ========================================================================== //
+
+#pragma once
+
+#include <DO/Sara/SfM/Graph/FeatureGraph.hpp>
+
+#include <boost/graph/graph_utility.hpp>
+#include <boost/graph/incremental_components.hpp>
+#include <boost/pending/disjoint_sets.hpp>
+
+
+namespace DO::Sara {
+
+  //! @brief Feature disjoint sets for feature tracking.
+  struct FeatureDisjointSets
+  {
+    using Rank = FeatureGraph::VertexIndex*;
+    using Parent = FeatureGraph::Vertex*;
+    using Components = boost::component_index<FeatureGraph::VertexIndex>;
+
+    FeatureDisjointSets() = default;
+
+    explicit FeatureDisjointSets(const FeatureGraph& graph)
+    {
+      const FeatureGraph::Impl& g = graph;
+
+      const auto n = graph.num_vertices();
+      _rank.resize(n);
+      _parent.resize(n);
+      // `data()` is valid even for an empty graph, unlike `&v[0]`. A failed
+      // allocation throws `std::bad_alloc`, so no null check is needed.
+      _ds = std::make_unique<boost::disjoint_sets<Rank, Parent>>(
+          _rank.data(), _parent.data());
+      boost::initialize_incremental_components(g, *_ds);
+      boost::incremental_components(g, *_ds);
+    }
+
+    auto components() const -> Components
+    {
+      return {_parent.begin(), _parent.end()};
+    }
+    // NOTE: `_ds` is built from raw pointers into `_rank` and `_parent`.
+    std::vector<FeatureGraph::VertexIndex> _rank;
+    std::vector<FeatureGraph::Vertex> _parent;
+    std::unique_ptr<boost::disjoint_sets<Rank, Parent>> _ds;
+  };
+
+}  // namespace DO::Sara
diff --git a/cpp/src/DO/Sara/SfM/Graph/FeatureGID.hpp b/cpp/src/DO/Sara/SfM/Graph/FeatureGID.hpp
deleted file mode 100644
index afd29e130..000000000
--- a/cpp/src/DO/Sara/SfM/Graph/FeatureGID.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-// ========================================================================== //
-// This file is part of Sara, a basic set of libraries in C++ for computer
-// vision.
-//
-// Copyright (C) 2023-present David Ok <david.ok8@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla Public
-// License v. 2.0. If a copy of the MPL was not distributed with this file,
-// you can obtain one at http://mozilla.org/MPL/2.0/.
-// ========================================================================== //
-
-#pragma once
-
-#include <DO/Sara/SfM/Graph/CameraPoseGraph.hpp>
-
-#include <utility>
-
-
-namespace DO::Sara {
-
-  //! @brief Feature Global ID (GID).
-  struct FeatureGID
-  {
-    CameraPoseGraph::Vertex pose_vertex;
-    std::size_t feature_index;
-
-    auto operator==(const FeatureGID& other) const -> bool
-    {
-      return pose_vertex == other.pose_vertex &&
-             feature_index == other.feature_index;
-    }
-
-    auto operator<(const FeatureGID& other) const -> bool
-    {
-      return (pose_vertex < other.pose_vertex) ||
-             (pose_vertex == other.pose_vertex &&
-              feature_index < other.feature_index);
-    }
-  };
-
-}  // namespace DO::Sara
diff --git a/cpp/src/DO/Sara/SfM/Graph/FeatureGraph.cpp b/cpp/src/DO/Sara/SfM/Graph/FeatureGraph.cpp
index 9315d363b..b53d35fe0 100644
--- a/cpp/src/DO/Sara/SfM/Graph/FeatureGraph.cpp
+++ b/cpp/src/DO/Sara/SfM/Graph/FeatureGraph.cpp
@@ -12,8 +12,8 @@
 #include <DO/Sara/SfM/Graph/FeatureGraph.hpp>
 
 #include <boost/graph/adjacency_list.hpp>
-#include <boost/pending/disjoint_sets.hpp>
 #include <boost/graph/incremental_components.hpp>
+#include <boost/pending/disjoint_sets.hpp>
 
 
 using namespace DO::Sara;
@@ -22,13 +22,9 @@ using namespace DO::Sara;
 auto FeatureGraph::calculate_feature_tracks() const
     -> std::vector<FeatureGraph::Track>
 {
-  using VertexIndex = boost::graph_traits<FeatureGraph::GraphImpl>::vertices_size_type;
-
   using Rank = VertexIndex*;
   using Parent = Vertex*;
 
-  // using DisjointSets = boost::disjoint_sets<Rank, Parent>;
-
   using Components = boost::component_index<VertexIndex>;
 
   const auto num_vertices = boost::num_vertices(_feature_graph);
diff --git a/cpp/src/DO/Sara/SfM/Graph/FeatureGraph.hpp b/cpp/src/DO/Sara/SfM/Graph/FeatureGraph.hpp
index 06ef1b36a..c5dd624dd 100644
--- a/cpp/src/DO/Sara/SfM/Graph/FeatureGraph.hpp
+++ b/cpp/src/DO/Sara/SfM/Graph/FeatureGraph.hpp
@@ -12,22 +12,74 @@
 #pragma once
 
 #include <DO/Sara/SfM/Graph/CameraPoseGraph.hpp>
-#include <DO/Sara/SfM/Graph/FeatureGID.hpp>
+#include <boost/graph/detail/adjacency_list.hpp>
 
 
 namespace DO::Sara {
 
-  class FeatureGraph
+  //! @brief Feature Global ID (GID).
+  struct FeatureGID
   {
-    using GraphImpl = boost::adjacency_list<           //
-        boost::vecS, boost::vecS, boost::undirectedS,  //
-        FeatureGID>;
+    CameraPoseGraph::Vertex pose_vertex;
+    int feature_index;
+
+    auto operator==(const FeatureGID& other) const -> bool
+    {
+      return pose_vertex == other.pose_vertex &&
+             feature_index == other.feature_index;
+    }
+
+    auto operator<(const FeatureGID& other) const -> bool
+    {
+      return (pose_vertex < other.pose_vertex) ||
+             (pose_vertex == other.pose_vertex &&
+              feature_index < other.feature_index);
+    }
+  };
+
+  //! @brief Match global ID (GID).
+  struct MatchGID
+  {
+    //! @brief Index of the epipolar edge connecting camera i and camera j.
+    CameraPoseGraph::Vertex i;
+    CameraPoseGraph::Vertex j;
+    //! @brief Local match index.
+    std::size_t index;
+
+    auto operator==(const MatchGID& other) const -> bool
+    {
+      return i == other.i && j == other.j && index == other.index;
+    }
+
+    auto operator<(const MatchGID& other) const -> bool
+    {
+      return (i < other.i) || (i == other.i && j < other.j) ||
+             (i == other.i && j == other.j && index < other.index);
+    }
+  };
 
+  //! @brief Feature Graph.
+  class FeatureGraph
+  {
   public:
-    using Vertex = boost::graph_traits<GraphImpl>::vertex_descriptor;
-    using Edge = boost::graph_traits<GraphImpl>::edge_descriptor;
+    using Impl = boost::adjacency_list<                //
+        boost::vecS, boost::vecS, boost::undirectedS,  //
+        FeatureGID, MatchGID>;
+    using Vertex = boost::graph_traits<Impl>::vertex_descriptor;
+    using VertexIndex = boost::graph_traits<Impl>::vertices_size_type;
+    using Edge = boost::graph_traits<Impl>::edge_descriptor;
     using Track = std::vector<Vertex>;
 
+    operator Impl&()
+    {
+      return _feature_graph;
+    }
+
+    operator const Impl&() const
+    {
+      return _feature_graph;
+    }
+
     auto operator[](Vertex v) -> FeatureGID&
     {
       return _feature_graph[v];
@@ -38,6 +90,11 @@ namespace DO::Sara {
       return _feature_graph[v];
     }
 
+    auto num_vertices() const -> VertexIndex
+    {
+      return boost::num_vertices(_feature_graph);
+    }
+
     auto calculate_feature_tracks() const -> std::vector<Track>;
     auto filter_by_non_max_suppression(const Track&,
                                        const CameraPoseGraph&) const -> Track;
@@ -46,7 +103,7 @@ namespace DO::Sara {
         -> Vertex;
 
   private:
-    GraphImpl _feature_graph;
+    Impl _feature_graph;
   };
 
 } /* namespace DO::Sara */
diff --git a/cpp/src/DO/Sara/SfM/Graph/FeatureTracker.cpp b/cpp/src/DO/Sara/SfM/Graph/FeatureTracker.cpp
new file mode 100644
index 000000000..497627f90
--- /dev/null
+++ b/cpp/src/DO/Sara/SfM/Graph/FeatureTracker.cpp
@@ -0,0 +1,189 @@
+// ========================================================================== //
+// This file is part of Sara, a basic set of libraries in C++ for computer
+// vision.
+//
+// Copyright (C) 2024-present David Ok <david.ok8@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License v. 2.0. If a copy of the MPL was not distributed with this file,
+// you can obtain one at http://mozilla.org/MPL/2.0/.
+// ========================================================================== //
+
+#include <DO/Sara/SfM/Graph/FeatureTracker.hpp>
+
+#include <DO/Sara/Logging/Logger.hpp>
+
+#include <boost/foreach.hpp>
+
+
+using namespace DO::Sara;
+
+auto FeatureTracker::update_feature_tracks(
+    const CameraPoseGraph& camera_pose_graph,
+    const CameraPoseGraph::Edge relative_pose_edge) -> void
+{
+  auto& logger = Logger::get();
+
+  const CameraPoseGraph::Impl& cg = camera_pose_graph;
+  FeatureGraph::Impl& fg = _feature_graph;
+
+  // Retrieve the two camera vertices from the relative pose edge.
+  const auto pose_i = boost::source(relative_pose_edge, cg);
+  const auto pose_j = boost::target(relative_pose_edge, cg);
+  // The relative pose edge contains the set of all feature correspondences.
+  const auto& matches = cg[relative_pose_edge].matches;
+  // Which of these feature correspondences are marked as inliers?
+  const auto& inliers = cg[relative_pose_edge].inliers;
+
+  // Loop over the feature correspondence and add the feature graph edges.
+  SARA_LOGD(logger, "Pose {} <-> Pose {}", pose_i, pose_j);
+  SARA_LOGD(logger, "Add feature correspondences...");
+  for (auto m = 0u; m < matches.size(); ++m)
+  {
+    if (!inliers(m))
+      continue;
+
+    const auto& match = matches[m];
+
+    // Local feature indices.
+    const auto& f1 = match.x_index();
+    const auto& f2 = match.y_index();
+
+    // Create their corresponding feature GIDs.
+    const auto gid1 = FeatureGID{
+        .pose_vertex = pose_i,  //
+        .feature_index = f1     //
+    };
+    const auto gid2 = FeatureGID{
+        .pose_vertex = pose_j,  //
+        .feature_index = f2     //
+    };
+
+    // Locate their corresponding pair of vertices (u, v) in the graph?
+    // Do they exist yet in the first place?
+    const auto u_it = _feature_vertex.find(gid1);
+    const auto v_it = _feature_vertex.find(gid2);
+
+    const auto u_does_not_exist_yet = u_it == _feature_vertex.end();
+    const auto v_does_not_exist_yet = v_it == _feature_vertex.end();
+
+    // If not, add them if necessary.
+    const auto u = u_does_not_exist_yet ? boost::add_vertex(fg) : u_it->second;
+    const auto v = v_does_not_exist_yet ? boost::add_vertex(fg) : v_it->second;
+
+    if (u_does_not_exist_yet)
+    {
+      fg[u] = gid1;
+      _feature_vertex[gid1] = u;
+    }
+    if (v_does_not_exist_yet)
+    {
+      fg[v] = gid2;
+      _feature_vertex[gid2] = v;
+    }
+
+    // Finally, store the feature match as an edge in the feature graph.
+    const auto [uv, uv_added] = boost::add_edge(u, v, fg);
+    auto& uv_attrs = fg[uv];
+    uv_attrs.i = boost::source(relative_pose_edge, cg);
+    uv_attrs.j = boost::target(relative_pose_edge, cg);
+    uv_attrs.index = m;
+  }
+
+  // Rebuild the member disjoint-sets; a local would shadow `_feature_ds`.
+  SARA_LOGD(logger, "[Feature-Tracks] Recalculating connected components...");
+  _feature_ds = FeatureDisjointSets{_feature_graph};
+  const auto feature_components = _feature_ds.components();
+  SARA_LOGD(logger, "[Feature-Tracks] num feature components = {}",
+            feature_components.size());
+
+  // Update the list of feature tracks.
+  _feature_tracks.clear();
+  _feature_tracks.reserve(feature_components.size());
+
+  BOOST_FOREACH (FeatureGraph::VertexIndex current_index, feature_components)
+  {
+    // Iterate through the child vertex indices for [current_index]
+    auto component_size = 0;
+    BOOST_FOREACH (FeatureGraph::VertexIndex child_index,
+                   feature_components[current_index])
+    {
+      (void) child_index;
+      ++component_size;
+    }
+
+    if (component_size == 1)
+      continue;
+
+    auto track = std::vector<FeatureGraph::VertexIndex>{};
+    track.reserve(component_size);
+    BOOST_FOREACH (FeatureGraph::VertexIndex child_index,
+                   feature_components[current_index])
+      track.push_back(child_index);
+
+    _feature_tracks.emplace_back(std::move(track));
+  }
+
+  SARA_LOGD(logger, "[Feature-Tracks] num feature tracks = {}",
+            _feature_tracks.size());
+}
+
+auto FeatureTracker::calculate_alive_feature_tracks(
+    const CameraPoseGraph::Vertex camera_vertex_curr) const
+    -> std::tuple<TrackArray, TrackVisibilityCountArray>
+{
+  auto& logger = Logger::get();
+
+  // Find the feature tracks that are still alive.
+  const FeatureGraph::Impl& fgraph = _feature_graph;
+
+  const auto& ftracks = _feature_tracks;
+  auto tracks_alive = TrackArray{};
+  auto track_visibility_count = TrackVisibilityCountArray{};
+
+  for (const auto& ftrack : ftracks)
+  {
+    // Do we still see the track in the image?
+    const auto is_alive =
+        std::find_if(ftrack.begin(), ftrack.end(),
+                     [&fgraph, camera_vertex_curr](const auto& v) {
+                       return fgraph[v].pose_vertex == camera_vertex_curr;
+                     }) != ftrack.end();
+
+    if (!is_alive)
+      continue;
+
+    // Add the newly found alive track.
+    tracks_alive.push_back(ftrack);
+
+    // Carefully count the track life, it's not the number of vertices, but
+    // the number of camera views in which the feature reappears.
+    auto camera_vertices_where_present =
+        std::unordered_set<CameraPoseGraph::Vertex>{};
+    std::transform(ftrack.begin(), ftrack.end(),
+                   std::inserter(camera_vertices_where_present,
+                                 camera_vertices_where_present.end()),
+                   [&fgraph](const auto& v) { return fgraph[v].pose_vertex; });
+    track_visibility_count.push_back(camera_vertices_where_present.size());
+  }
+  SARA_LOGD(logger, "Num tracks alive: {}", tracks_alive.size());
+
+  const auto longest_track_alive = std::max_element(
+      track_visibility_count.begin(), track_visibility_count.end());
+  if (longest_track_alive != track_visibility_count.end())
+  {
+    SARA_LOGD(logger, "Longest track life: {}", *longest_track_alive);
+#if 0
+       const auto longest_track_index =
+           longest_track_alive - track_visibility_count.begin();
+       const auto& longest_track = tracks_alive[longest_track_index];
+       for (const auto& v : longest_track)
+         std::cout << fmt::format("(cam:{},ind:{})", fgraph[v].pose_vertex,
+                                  fgraph[v].feature_index)
+                   << " ";
+       std::cout << std::endl;
+#endif
+  }
+
+  return {std::move(tracks_alive), std::move(track_visibility_count)};
+}
diff --git a/cpp/src/DO/Sara/SfM/Graph/FeatureTracker.hpp b/cpp/src/DO/Sara/SfM/Graph/FeatureTracker.hpp
new file mode 100644
index 000000000..45323fb72
--- /dev/null
+++ b/cpp/src/DO/Sara/SfM/Graph/FeatureTracker.hpp
@@ -0,0 +1,53 @@
+// ========================================================================== //
+// This file is part of Sara, a basic set of libraries in C++ for computer
+// vision.
+//
+// Copyright (C) 2024-present David Ok <david.ok8@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License v. 2.0. If a copy of the MPL was not distributed with this file,
+// you can obtain one at http://mozilla.org/MPL/2.0/.
+// ========================================================================== //
+
+#pragma once
+
+#include <DO/Sara/SfM/Graph/FeatureDisjointSets.hpp>
+
+#include <map>
+
+
+namespace DO::Sara {
+
+  //! @brief Feature tracker: holds the feature graph and the feature tracks.
+  struct FeatureTracker
+  {
+    using Track = std::vector<FeatureGraph::VertexIndex>;
+    using TrackArray = std::vector<Track>;
+    using TrackVisibilityCountArray = std::vector<std::size_t>;
+
+    //! @brief The graph of 2D image features.
+    //!
+    //! Image features are connected if there exists a relative pose that
+    //! explains them.
+    FeatureGraph _feature_graph;
+    FeatureDisjointSets _feature_ds;
+    std::vector<Track> _feature_tracks;
+
+    //! @brief Retrieve the feature vertex from its pair (camera pose vertex,
+    //! keypoint index).
+    std::map<FeatureGID, FeatureGraph::Vertex> _feature_vertex;
+
+    //! @brief Retrieve the feature edge from its pair (camera pose edge,
+    //! match index).
+    std::map<MatchGID, FeatureGraph::Edge> _feature_match;
+    //! @brief Update the tracks from a relative pose edge (TODO confirm).
+    auto update_feature_tracks(  //
+        const CameraPoseGraph& camera_pose_graph,
+        const CameraPoseGraph::Edge relative_pose_edge_id) -> void;
+    //! @brief Tracks seen from `camera_vertex_curr` and their view counts.
+    auto calculate_alive_feature_tracks(
+        const CameraPoseGraph::Vertex camera_vertex_curr) const
+        -> std::tuple<TrackArray, TrackVisibilityCountArray>;
+  };
+
+}  // namespace DO::Sara
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks.hpp b/cpp/src/DO/Sara/SfM/Helpers.hpp
similarity index 62%
rename from cpp/src/DO/Sara/SfM/BuildingBlocks.hpp
rename to cpp/src/DO/Sara/SfM/Helpers.hpp
index fece2c527..c2134ef25 100644
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks.hpp
+++ b/cpp/src/DO/Sara/SfM/Helpers.hpp
@@ -11,8 +11,7 @@
 
 #pragma once
 
-#include <DO/Sara/SfM/BuildingBlocks/KeypointDetection.hpp>
-#include <DO/Sara/SfM/BuildingBlocks/KeypointMatching.hpp>
-#include <DO/Sara/SfM/BuildingBlocks/FundamentalMatrixEstimation.hpp>
-#include <DO/Sara/SfM/BuildingBlocks/EssentialMatrixEstimation.hpp>
-#include <DO/Sara/SfM/BuildingBlocks/Triangulation.hpp>
+#include <DO/Sara/SfM/Helpers/KeypointMatching.hpp>
+#include <DO/Sara/SfM/Helpers/FundamentalMatrixEstimation.hpp>
+#include <DO/Sara/SfM/Helpers/EssentialMatrixEstimation.hpp>
+#include <DO/Sara/SfM/Helpers/Triangulation.hpp>
diff --git a/cpp/src/DO/Sara/SfM/Helpers/EssentialMatrixEstimation.cpp b/cpp/src/DO/Sara/SfM/Helpers/EssentialMatrixEstimation.cpp
new file mode 100644
index 000000000..af744f710
--- /dev/null
+++ b/cpp/src/DO/Sara/SfM/Helpers/EssentialMatrixEstimation.cpp
@@ -0,0 +1,55 @@
+// ========================================================================== //
+// This file is part of Sara, a basic set of libraries in C++ for computer
+// vision.
+//
+// Copyright (C) 2019 David Ok <david.ok8@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License v. 2.0. If a copy of the MPL was not distributed with this file,
+// you can obtain one at http://mozilla.org/MPL/2.0/.
+// ========================================================================== //
+
+#include <DO/Sara/SfM/Helpers/EssentialMatrixEstimation.hpp>
+
+#include <DO/Sara/RANSAC/RANSAC.hpp>
+#include <DO/Sara/SfM/Helpers/FundamentalMatrixEstimation.hpp>
+
+
+namespace DO::Sara {
+
+  using ESolver = NisterFivePointAlgorithm;
+
+  auto estimate_essential_matrix(const std::vector<Match>& Mij,            //
+                                 const KeypointList<OERegion, float>& ki,  //
+                                 const KeypointList<OERegion, float>& kj,  //
+                                 const Eigen::Matrix3d& Ki_inv,            //
+                                 const Eigen::Matrix3d& Kj_inv,            //
+                                 int num_samples,                          //
+                                 double err_thres)
+      -> std::tuple<EssentialMatrix, Tensor_<bool, 1>, Tensor_<int, 1>>
+  {
+    const auto centers_i = extract_centers(features(ki)).cast<double>();
+    const auto centers_j = extract_centers(features(kj)).cast<double>();
+
+    // Apply `Ki_inv` and `Kj_inv` to the homogeneous keypoint centers.
+    const auto rays_i = apply_transform(Ki_inv, homogeneous(centers_i));
+    const auto rays_j = apply_transform(Kj_inv, homogeneous(centers_j));
+
+    const auto matches = to_tensor(Mij);
+    const auto point_pairs = PointCorrespondenceList{matches, rays_i, rays_j};
+
+    // Inliers are decided with the algebraic epipolar distance.
+    auto is_inlier = InlierPredicate<AlgebraicEpipolarDistance>{};
+    is_inlier.err_threshold = err_thres;
+
+    const auto [E, inliers, sample_best] =
+        ransac(point_pairs, ESolver{}, is_inlier, num_samples);
+
+    SARA_CHECK(E);
+    SARA_CHECK(inliers.row_vector());
+    SARA_CHECK(Mij.size());
+
+    return std::make_tuple(E, inliers, sample_best);
+  }
+
+} /* namespace DO::Sara */
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/EssentialMatrixEstimation.hpp b/cpp/src/DO/Sara/SfM/Helpers/EssentialMatrixEstimation.hpp
similarity index 59%
rename from cpp/src/DO/Sara/SfM/BuildingBlocks/EssentialMatrixEstimation.hpp
rename to cpp/src/DO/Sara/SfM/Helpers/EssentialMatrixEstimation.hpp
index ed9bdf87c..6961f6cbc 100644
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks/EssentialMatrixEstimation.hpp
+++ b/cpp/src/DO/Sara/SfM/Helpers/EssentialMatrixEstimation.hpp
@@ -11,7 +11,6 @@
 
 #pragma once
 
-#include <DO/Sara/Defines.hpp>
 #include <DO/Sara/MultiViewGeometry.hpp>
 
 
@@ -20,8 +19,7 @@ namespace DO::Sara {
   //! @addtogroup SfM
   //! @{
 
-  //! @{
-  //! @brief Essential matrix estimation.
+  //! @brief Helper to estimate the essential matrix.
   auto estimate_essential_matrix(const std::vector<Match>& Mij,
                                  const KeypointList<OERegion, float>& ki,
                                  const KeypointList<OERegion, float>& kj,
@@ -30,18 +28,6 @@ namespace DO::Sara {
                                  double err_thres)
       -> std::tuple<EssentialMatrix, Tensor_<bool, 1>, Tensor_<int, 1>>;
 
-  auto estimate_essential_matrices(const std::string& dirpath,      //
-                                   const std::string& h5_filepath,  //
-                                   int num_samples,                 //
-                                   double noise,                    //
-                                   int min_F_inliers,               //
-                                   bool overwrite, bool debug,
-                                   bool wait_key = false) -> void;
-
-  auto inspect_essential_matrices(const std::string& dirpath,
-                                  const std::string& h5_filepath,
-                                  int display_step, bool wait_key) -> void;
-  //! @}
 
   //! @}
 
diff --git a/cpp/src/DO/Sara/SfM/Helpers/FundamentalMatrixEstimation.cpp b/cpp/src/DO/Sara/SfM/Helpers/FundamentalMatrixEstimation.cpp
new file mode 100644
index 000000000..21c84aa59
--- /dev/null
+++ b/cpp/src/DO/Sara/SfM/Helpers/FundamentalMatrixEstimation.cpp
@@ -0,0 +1,122 @@
+// ========================================================================== //
+// This file is part of Sara, a basic set of libraries in C++ for computer
+// vision.
+//
+// Copyright (C) 2019 David Ok <david.ok8@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License v. 2.0. If a copy of the MPL was not distributed with this file,
+// you can obtain one at http://mozilla.org/MPL/2.0/.
+// ========================================================================== //
+
+#include <DO/Sara/SfM/Helpers/FundamentalMatrixEstimation.hpp>
+
+#include <DO/Sara/MultiViewGeometry.hpp>
+#include <DO/Sara/RANSAC/RANSACv2.hpp>
+#include <DO/Sara/Visualization.hpp>
+
+
+namespace DO::Sara {
+
+  using FSolver = SevenPointAlgorithmDoublePrecision;
+
+  auto estimate_fundamental_matrix(const std::vector<Match>& Mij,
+                                   const KeypointList<OERegion, float>& ki,
+                                   const KeypointList<OERegion, float>& kj,
+                                   const int num_samples,
+                                   const double err_thres)
+      -> std::tuple<FundamentalMatrix, Tensor_<bool, 1>, Tensor_<int, 1>>
+  {
+    // Homogeneous coordinates of the keypoint centers in both images.
+    const auto pi = homogeneous(extract_centers(features(ki)).cast<double>());
+    const auto pj = homogeneous(extract_centers(features(kj)).cast<double>());
+
+    const auto matches = to_tensor(Mij);
+    const auto point_pairs = PointCorrespondenceList{matches, pi, pj};
+    // The data normalizer is built from the correspondences themselves.
+    const auto normalizer =
+        std::make_optional(Normalizer<FundamentalMatrix>{point_pairs});
+
+    // Inliers are decided with the Sampson epipolar distance.
+    auto is_inlier = InlierPredicate<SampsonEpipolarDistance>{};
+    is_inlier.err_threshold = err_thres;
+
+    // Run RANSAC with `FSolver` at a fixed confidence level.
+    static constexpr auto confidence = 0.99;
+    const auto [F, inliers, sample_best] = v2::ransac(  //
+        point_pairs, FSolver{}, is_inlier,              //
+        num_samples, confidence,                        //
+        normalizer,                                     //
+        true);
+
+    return std::make_tuple(F, inliers, sample_best);
+  }
+
+  //! Draw the matches and their epipolar lines for visual inspection.
+  auto check_epipolar_constraints(const Image<Rgb8>& Ii, const Image<Rgb8>& Ij,
+                                  const FundamentalMatrix& F,
+                                  const std::vector<Match>& Mij,
+                                  const TensorView_<int, 1>& sample_best,
+                                  const TensorView_<bool, 1>& inliers,
+                                  int display_step, bool wait_key) -> void
+  {
+    // The window fits both images side by side, downscaled by `scale`.
+    const auto scale = 0.25f;
+    const auto w = int((Ii.width() + Ij.width()) * scale + 0.5f);
+    const auto h = int(std::max(Ii.height(), Ij.height()) * scale + 0.5f);
+
+    if (!active_window())
+    {
+      create_window(w, h);
+      set_antialiasing();
+    }
+
+    if (get_sizes(active_window()) != Eigen::Vector2i(w, h))
+      resize_window(w, h);
+
+    PairWiseDrawer drawer(Ii, Ij);
+    drawer.set_viz_params(scale, scale, PairWiseDrawer::CatH);
+    drawer.display_images();
+
+    // A zero step would make `m % display_step` undefined behavior: fall back
+    // to a step of 1 (every inlier is drawn) in that case.
+    const auto step = display_step == 0 ? 1 : display_step;
+
+    // Draw every inlier match whose index is a multiple of `step`.
+    for (auto m = 0; m < static_cast<int>(Mij.size()); ++m)
+    {
+      if (!inliers(m) || m % step != 0)
+        continue;
+
+      drawer.draw_match(Mij[m], Blue8, false);
+      const Eigen::Vector3d X1 = Mij[m].x_pos().cast<double>().homogeneous();
+      const Eigen::Vector3d X2 = Mij[m].y_pos().cast<double>().homogeneous();
+      const auto proj_X1 = F.right_epipolar_line(X1);
+      const auto proj_X2 = F.left_epipolar_line(X2);
+
+      drawer.draw_line_from_eqn(0, proj_X2.cast<float>(), Cyan8, 1);
+      drawer.draw_line_from_eqn(1, proj_X1.cast<float>(), Cyan8, 1);
+    }
+
+    for (auto m = 0; m < static_cast<int>(sample_best.size()); ++m)
+    {
+      // Draw the best elemental subset drawn by RANSAC.
+      drawer.draw_match(Mij[sample_best(m)], Red8, true);
+      const Eigen::Vector3d X1 =
+          Mij[sample_best(m)].x_pos().cast<double>().homogeneous();
+      const Eigen::Vector3d X2 =
+          Mij[sample_best(m)].y_pos().cast<double>().homogeneous();
+
+      const auto proj_X1 = F.right_epipolar_line(X1);
+      const auto proj_X2 = F.left_epipolar_line(X2);
+
+      // Draw the corresponding epipolar lines.
+      drawer.draw_line_from_eqn(1, proj_X1.cast<float>(), Magenta8, 1);
+      drawer.draw_line_from_eqn(0, proj_X2.cast<float>(), Magenta8, 1);
+    }
+
+    if (wait_key)
+      get_key();
+  }
+
+} /* namespace DO::Sara */
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/FundamentalMatrixEstimation.hpp b/cpp/src/DO/Sara/SfM/Helpers/FundamentalMatrixEstimation.hpp
similarity index 73%
rename from cpp/src/DO/Sara/SfM/BuildingBlocks/FundamentalMatrixEstimation.hpp
rename to cpp/src/DO/Sara/SfM/Helpers/FundamentalMatrixEstimation.hpp
index 846143171..ee054761a 100644
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks/FundamentalMatrixEstimation.hpp
+++ b/cpp/src/DO/Sara/SfM/Helpers/FundamentalMatrixEstimation.hpp
@@ -20,8 +20,7 @@ namespace DO::Sara {
   //! @addtogroup SfM
   //! @{
 
-  //! @{
-  //! @brief Fundamental matrix estimation.
+  //! @brief Helper to estimate the fundamental matrix.
   auto estimate_fundamental_matrix(const std::vector<Match>& Mij,
                                    const KeypointList<OERegion, float>& ki,
                                    const KeypointList<OERegion, float>& kj,
@@ -29,11 +28,7 @@ namespace DO::Sara {
                                    const double err_thres)
       -> std::tuple<FundamentalMatrix, Tensor_<bool, 1>, Tensor_<int, 1>>;
 
-  auto estimate_fundamental_matrices(const std::string& dirpath,
-                                     const std::string& h5_filepath,
-                                     bool overwrite, bool debug,
-                                     bool wait_key = false) -> void;
-
+  //! @brief Inspect visually the epipolar constraints.
   auto check_epipolar_constraints(const Image<Rgb8>& Ii, const Image<Rgb8>& Ij,
                                   const FundamentalMatrix& F,
                                   const std::vector<Match>& Mij,
@@ -42,11 +37,6 @@ namespace DO::Sara {
                                   int display_step, bool wait_key = true)
       -> void;
 
-  auto inspect_fundamental_matrices(const std::string& dirpath,
-                                    const std::string& h5_filepath,
-                                    int display_step, bool wait_key) -> void;
-  //! @}
-
   //! @}
 
 } /* namespace DO::Sara */
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/KeypointDetection.hpp b/cpp/src/DO/Sara/SfM/Helpers/KeypointMatching.cpp
similarity index 58%
rename from cpp/src/DO/Sara/SfM/BuildingBlocks/KeypointDetection.hpp
rename to cpp/src/DO/Sara/SfM/Helpers/KeypointMatching.cpp
index 9a163b565..f02c2da4e 100644
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks/KeypointDetection.hpp
+++ b/cpp/src/DO/Sara/SfM/Helpers/KeypointMatching.cpp
@@ -9,27 +9,19 @@
 // you can obtain one at http://mozilla.org/MPL/2.0/.
 // ========================================================================== //
 
-#pragma once
+#include <DO/Sara/SfM/Helpers/KeypointMatching.hpp>
 
-#include <DO/Sara/Defines.hpp>
-
-#include <string>
+#include <DO/Sara/FeatureMatching.hpp>
 
 
 namespace DO::Sara {
 
-  //! @addtogroup SfM
-  //! @{
-
-  //! @{
-  //! @brief Keypoint detection.
-  auto detect_keypoints(const std::string& dirpath,
-                        const std::string& h5_filepath, bool overwrite) -> void;
-
-  auto read_keypoints(const std::string& dirpath,
-                      const std::string& h5_filepath) -> void;
-  //! @}
-
-  //! @}
+  auto match(const KeypointList<OERegion, float>& keys1,
+             const KeypointList<OERegion, float>& keys2, float lowe_ratio)
+      -> std::vector<Match>
+  {
+    // Delegate to a temporary `AnnMatcher`, passing `lowe_ratio` through.
+    return AnnMatcher{keys1, keys2, lowe_ratio}.compute_matches();
+  }
 
 } /* namespace DO::Sara */
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/KeypointMatching.hpp b/cpp/src/DO/Sara/SfM/Helpers/KeypointMatching.hpp
similarity index 82%
rename from cpp/src/DO/Sara/SfM/BuildingBlocks/KeypointMatching.hpp
rename to cpp/src/DO/Sara/SfM/Helpers/KeypointMatching.hpp
index cd5232079..1d8b421c1 100644
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks/KeypointMatching.hpp
+++ b/cpp/src/DO/Sara/SfM/Helpers/KeypointMatching.hpp
@@ -11,7 +11,6 @@
 
 #pragma once
 
-#include <DO/Sara/Defines.hpp>
 #include <DO/Sara/Match.hpp>
 
 
@@ -20,16 +19,11 @@ namespace DO::Sara {
   //! @addtogroup SfM
   //! @{
 
-  //! @{
   //! @brief Keypoint matching.
   auto match(const KeypointList<OERegion, float>& keys1,
              const KeypointList<OERegion, float>& keys2,
              float lowe_ratio = 0.6f) -> std::vector<Match>;
 
-  auto match_keypoints(const std::string& dirpath,
-                       const std::string& h5_filepath, bool overwrite) -> void;
-  //! @}
-
   //! @}
 
 } /* namespace DO::Sara */
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/Triangulation.cpp b/cpp/src/DO/Sara/SfM/Helpers/Triangulation.cpp
similarity index 99%
rename from cpp/src/DO/Sara/SfM/BuildingBlocks/Triangulation.cpp
rename to cpp/src/DO/Sara/SfM/Helpers/Triangulation.cpp
index 5053450e4..9ff32f174 100644
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks/Triangulation.cpp
+++ b/cpp/src/DO/Sara/SfM/Helpers/Triangulation.cpp
@@ -12,7 +12,7 @@
 #include <DO/Sara/Core/MultiArray/DataTransformations.hpp>
 #include <DO/Sara/ImageProcessing/Interpolation.hpp>
 #include <DO/Sara/MultiViewGeometry/Miscellaneous.hpp>
-#include <DO/Sara/SfM/BuildingBlocks.hpp>
+#include <DO/Sara/SfM/Helpers.hpp>
 
 
 using namespace std;
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/Triangulation.hpp b/cpp/src/DO/Sara/SfM/Helpers/Triangulation.hpp
similarity index 100%
rename from cpp/src/DO/Sara/SfM/BuildingBlocks/Triangulation.hpp
rename to cpp/src/DO/Sara/SfM/Helpers/Triangulation.hpp
diff --git a/cpp/src/DO/Sara/SfM/Odometry/FeatureTracker.hpp b/cpp/src/DO/Sara/SfM/Odometry/FeatureTracker.hpp
index c52d0c252..2fd475342 100644
--- a/cpp/src/DO/Sara/SfM/Odometry/FeatureTracker.hpp
+++ b/cpp/src/DO/Sara/SfM/Odometry/FeatureTracker.hpp
@@ -3,7 +3,7 @@
 #include <DO/Sara/Core/Image.hpp>
 
 #include <DO/Sara/FeatureDetectors/SIFT.hpp>
-#include <DO/Sara/SfM/BuildingBlocks/KeypointMatching.hpp>
+#include <DO/Sara/SfM/Helpers/KeypointMatching.hpp>
 
 
 namespace DO::Sara {
diff --git a/cpp/src/DO/Sara/SfM/Odometry/RelativePoseEstimator.hpp b/cpp/src/DO/Sara/SfM/Odometry/RelativePoseEstimator.hpp
index 3361cf3c6..9090e0ea1 100644
--- a/cpp/src/DO/Sara/SfM/Odometry/RelativePoseEstimator.hpp
+++ b/cpp/src/DO/Sara/SfM/Odometry/RelativePoseEstimator.hpp
@@ -10,7 +10,7 @@
 
 #include <DO/Sara/RANSAC/RANSACv2.hpp>
 
-#include <DO/Sara/SfM/BuildingBlocks/KeypointMatching.hpp>
+#include <DO/Sara/SfM/Helpers/KeypointMatching.hpp>
 
 
 namespace DO::Sara {
diff --git a/cpp/src/DO/Sara/SfM/Odometry/Triangulator.hpp b/cpp/src/DO/Sara/SfM/Odometry/Triangulator.hpp
index 4c910ba1b..41993c0e1 100644
--- a/cpp/src/DO/Sara/SfM/Odometry/Triangulator.hpp
+++ b/cpp/src/DO/Sara/SfM/Odometry/Triangulator.hpp
@@ -2,7 +2,7 @@
 
 #include <DO/Sara/Core/Tensor.hpp>
 #include <DO/Sara/MultiViewGeometry/PointCorrespondenceList.hpp>
-#include <DO/Sara/SfM/BuildingBlocks/Triangulation.hpp>
+#include <DO/Sara/SfM/Helpers/Triangulation.hpp>
 
 
 namespace DO::Sara {
diff --git a/cpp/src/DO/Sara/SfM/Odometry/VideoStreamer.hpp b/cpp/src/DO/Sara/SfM/Odometry/VideoStreamer.hpp
index 984d9a608..fb1cee361 100644
--- a/cpp/src/DO/Sara/SfM/Odometry/VideoStreamer.hpp
+++ b/cpp/src/DO/Sara/SfM/Odometry/VideoStreamer.hpp
@@ -48,6 +48,11 @@ namespace DO::Sara {
       return _video_stream.height();
     }
 
+    auto frame_number() const -> int  // Accessor for `_frame_index`.
+    {
+      return _frame_index;
+    }
+
     auto skip() const -> bool
     {
       return _frame_index % (_num_skips + 1) != 0;
diff --git a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
new file mode 100644
index 000000000..a89fa9303
--- /dev/null
+++ b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
@@ -0,0 +1,120 @@
+// ========================================================================== //
+// This file is part of Sara, a basic set of libraries in C++ for computer
+// vision.
+//
+// Copyright (C) 2024-present David Ok <david.ok8@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License v. 2.0. If a copy of the MPL was not distributed with this file,
+// you can obtain one at http://mozilla.org/MPL/2.0/.
+// ========================================================================== //
+
+#include <DO/Sara/SfM/OdometryV2/OdometryPipeline.hpp>
+
+#include <DO/Sara/SfM/Graph/CameraPoseGraph.hpp>
+#include <DO/Sara/SfM/Helpers/KeypointMatching.hpp>
+
+
+using namespace DO::Sara;
+
+
+auto v2::OdometryPipeline::set_config(
+    const std::filesystem::path& video_path,
+    const v2::BrownConradyDistortionModel<double>& camera) -> void
+{
+  // Build the dependency graph: open the video stream and store the camera.
+  _video_streamer.open(video_path);
+  _camera = camera;
+
+  // Computer vision tasks: both are set up from the stored camera model.
+  _distortion_corrector = std::make_unique<ImageDistortionCorrector>(
+      _video_streamer.frame_rgb8(),     //
+      _video_streamer.frame_gray32f(),  //
+      _camera                           //
+  );
+  _relative_pose_estimator.configure(_camera);
+}
+
+auto v2::OdometryPipeline::read() -> bool  // Forwards to the video streamer.
+{
+  return _video_streamer.read();
+}
+
+auto v2::OdometryPipeline::process() -> void
+{
+  // Only undistort the frames that the video streamer does not skip.
+  const auto frame_is_kept = !_video_streamer.skip();
+  if (frame_is_kept)
+    _distortion_corrector->undistort();
+}
+
+auto v2::OdometryPipeline::make_display_frame() const -> Image<Rgb8>
+{
+  return _distortion_corrector->frame_rgb8();  // The corrector's RGB frame.
+}
+
+// Detect the SIFT keypoints in `image`, as declared in the class definition.
+auto v2::OdometryPipeline::detect_keypoints(const ImageView<float>& image) const
+    -> KeypointList<OERegion, float>
+{
+  return compute_sift_keypoints(image, _feature_params.image_pyr_params);
+}
+
+// Match the keypoints of the camera vertices `u` and `v` and estimate the
+// two-view geometry from the matches. A value-initialized pair is returned if
+// either keypoint list is empty or if no match is found.
+auto v2::OdometryPipeline::estimate_relative_pose(
+    const CameraPoseGraph::Vertex u,  //
+    const CameraPoseGraph::Vertex v) const
+    -> std::pair<RelativePoseData, TwoViewGeometry>
+{
+  auto& logger = Logger::get();
+
+  SARA_LOGI(logger, "Matching features...");
+  const auto& keys_u = _pose_graph[u].keypoints;
+  const auto& keys_v = _pose_graph[v].keypoints;
+  if (features(keys_u).empty() || features(keys_v).empty())
+    return {};
+
+  auto matches = match(keys_u, keys_v, _feature_params.sift_nn_ratio);
+  if (matches.empty())
+    return {};
+  // Cap the number of matches passed to the estimator.
+  if (matches.size() > _feature_params.num_matches_max)
+    matches.resize(_feature_params.num_matches_max);
+
+  SARA_LOGI(logger, "Estimating relative pose...");
+  auto [geometry, inliers, sample_best] =
+      _relative_pose_estimator.estimate_relative_pose(keys_u, keys_v, matches);
+  const auto num_inliers = inliers.flat_array().count();
+  SARA_LOGI(logger, "inlier count: {}", num_inliers);
+
+  // The relative motion is left value-initialized here.
+  return std::make_pair(RelativePoseData{.matches = std::move(matches),
+                                         .inliers = std::move(inliers),
+                                         .motion = {}},
+                        std::move(geometry));
+}
+
+auto v2::OdometryPipeline::add_camera_pose_and_grow_point_cloud() -> bool
+{
+  auto& logger = Logger::get();
+
+  // Detect and describe the local features of the current frame.
+  _pose_prev = _pose_curr;
+  const auto gray_frame = _distortion_corrector->frame_gray32f();
+  const auto frame_id = _video_streamer.frame_number();
+  _pose_curr = _pose_graph.add_absolute_pose(detect_keypoints(gray_frame),  //
+                                             frame_id);
+
+  const auto& curr_pose = _pose_graph[_pose_curr];
+  SARA_LOGI(logger, "Camera [frame:{}]: {} keypoints",  //
+            curr_pose.image_id, features(curr_pose.keypoints).size());
+
+  // We need two frames at least for the epipolar geometry.
+  const auto has_enough_views = _pose_graph.num_vertices() >= 2;
+  if (!has_enough_views)
+    return false;
+
+  return false;
+}
diff --git a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.hpp b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.hpp
new file mode 100644
index 000000000..50d1f324e
--- /dev/null
+++ b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.hpp
@@ -0,0 +1,76 @@
+// ========================================================================== //
+// This file is part of Sara, a basic set of libraries in C++ for computer
+// vision.
+//
+// Copyright (C) 2024-present David Ok <david.ok8@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License v. 2.0. If a copy of the MPL was not distributed with this file,
+// you can obtain one at http://mozilla.org/MPL/2.0/.
+// ========================================================================== //
+
+#pragma once
+
+#include <DO/Sara/Features/KeypointList.hpp>
+#include <DO/Sara/Graphics/ImageDraw.hpp>
+#include <DO/Sara/SfM/Odometry/ImageDistortionCorrector.hpp>
+#include <DO/Sara/SfM/Odometry/VideoStreamer.hpp>
+#include <DO/Sara/Visualization/Features/Draw.hpp>
+
+#include <DO/Sara/SfM/BuildingBlocks/RelativePoseEstimator.hpp>
+#include <DO/Sara/SfM/Graph/CameraPoseGraph.hpp>
+#include <DO/Sara/SfM/Graph/FeatureTracker.hpp>
+
+namespace DO::Sara::v2 {
+
+  class OdometryPipeline
+  {
+  public:
+    auto set_config(const std::filesystem::path& video_path,
+                    const v2::BrownConradyDistortionModel<double>& camera)
+        -> void;
+
+    auto read() -> bool;  //!< Forwards to the video streamer's `read()`.
+
+    auto process() -> void;  //!< Undistort the frame unless it is skipped.
+
+    auto make_display_frame() const -> Image<Rgb8>;  //!< Corrector's RGB frame.
+
+  private: /* computer vision tasks */
+    auto detect_keypoints(const ImageView<float>&) const
+        -> KeypointList<OERegion, float>;
+
+    auto estimate_relative_pose(const CameraPoseGraph::Vertex u,
+                                const CameraPoseGraph::Vertex v) const
+        -> std::pair<RelativePoseData, TwoViewGeometry>;
+
+  private: /* graph update tasks */
+    auto add_camera_pose_and_grow_point_cloud() -> bool;
+
+  private: /* data members */
+    VideoStreamer _video_streamer;
+    v2::BrownConradyDistortionModel<double> _camera;
+
+    std::unique_ptr<ImageDistortionCorrector> _distortion_corrector;
+    v2::RelativePoseEstimator _relative_pose_estimator;
+
+
+    //! @brief SfM data.
+    //! @{
+    FeatureParams _feature_params;
+    FeatureTracker _feature_tracker;
+    CameraPoseGraph _pose_graph;
+    //! @}
+
+    //! @brief SfM state.
+    //! @{
+    CameraPoseGraph::Vertex _pose_prev;  //!< Previous value of `_pose_curr`.
+    CameraPoseGraph::Vertex _pose_curr;  //!< Last vertex added to `_pose_graph`.
+    CameraPoseGraph::Edge _relative_pose_edge;
+    FeatureTracker::TrackArray _tracks_alive;
+    FeatureTracker::TrackVisibilityCountArray _track_visibility_count;
+    Eigen::Matrix3d _current_global_rotation = Eigen::Matrix3d::Identity();
+    //! @}
+  };
+
+}  // namespace DO::Sara::v2
diff --git a/cpp/test/Sara/MultiViewGeometry/test_multiviewgeometry_feature_graph.cpp b/cpp/test/Sara/MultiViewGeometry/test_multiviewgeometry_feature_graph.cpp
index d10de268a..b4590a55b 100644
--- a/cpp/test/Sara/MultiViewGeometry/test_multiviewgeometry_feature_graph.cpp
+++ b/cpp/test/Sara/MultiViewGeometry/test_multiviewgeometry_feature_graph.cpp
@@ -12,14 +12,15 @@
 #define BOOST_TEST_MODULE "MultiViewGeometry/Geometry/Feature Graph"
 
 #include <DO/Sara/Core/DebugUtilities.hpp>
-#include <DO/Sara/MultiViewGeometry/EpipolarGraph.hpp>
-#include <DO/Sara/MultiViewGeometry/FeatureGraph.hpp>
+#include <DO/Sara/MultiViewGeometry/Graph/EpipolarGraph.hpp>
+#include <DO/Sara/MultiViewGeometry/Graph/FeatureGraph.hpp>
 
-#include <boost/filesystem.hpp>
 #include <boost/test/unit_test.hpp>
 
+#include <filesystem>
 
-namespace fs = boost::filesystem;
+
+namespace fs = std::filesystem;
 using namespace DO::Sara;
 
 
@@ -72,10 +73,10 @@ BOOST_AUTO_TEST_CASE(test_incremental_connected_components)
   auto ds = ICC::initialize_disjoint_sets(rank, parent);
   ICC::initialize_incremental_components(graph, ds);
 
-  for (auto r: rank)
+  for (auto r : rank)
     std::cout << "rank = " << r << std::endl;
 
-  for (auto p: parent)
+  for (auto p : parent)
     std::cout << "p = " << p << std::endl;
 
   auto add_edge = [&](auto u, auto v) {
@@ -187,11 +188,9 @@ BOOST_AUTO_TEST_CASE(test_read_write_feature_graph_to_hdf5)
 
 BOOST_AUTO_TEST_CASE(test_populate_feature_gids)
 {
-  auto keys = std::vector{
-    KeypointList<OERegion, float>{},
-    KeypointList<OERegion, float>{},
-    KeypointList<OERegion, float>{}
-  };
+  auto keys = std::vector{KeypointList<OERegion, float>{},
+                          KeypointList<OERegion, float>{},
+                          KeypointList<OERegion, float>{}};
 
   features(keys[0]).resize(3);
   features(keys[1]).resize(1);
@@ -209,11 +208,9 @@ BOOST_AUTO_TEST_CASE(test_populate_feature_gids)
 
 BOOST_AUTO_TEST_CASE(test_calculate_of_feature_id_offset)
 {
-  auto keys = std::vector{
-    KeypointList<OERegion, float>{},
-    KeypointList<OERegion, float>{},
-    KeypointList<OERegion, float>{}
-  };
+  auto keys = std::vector{KeypointList<OERegion, float>{},
+                          KeypointList<OERegion, float>{},
+                          KeypointList<OERegion, float>{}};
 
   features(keys[0]).resize(3);
   features(keys[1]).resize(1);
@@ -300,7 +297,7 @@ BOOST_AUTO_TEST_CASE(test_populate_feature_tracks)
     const auto& component = components[c];
 
     std::cout << "Component " << c << " : ";
-    for (const auto& v: component)
+    for (const auto& v : component)
       std::cout << "GID[" << v << "] = {" << graph[v].image_id << ", "
                 << graph[v].local_id << "}, ";
     std::cout << std::endl;

From 1b8c491449869a45db6ecce575834700a293d282 Mon Sep 17 00:00:00 2001
From: Odd Kiva <2375733-oddkiva@users.noreply.gitlab.com>
Date: Thu, 11 Apr 2024 19:18:19 +0100
Subject: [PATCH 28/49] MAINT: fix compile errors.

---
 .../BuildingBlocks/RelativePoseEstimator.cpp  |  6 ---
 .../BuildingBlocks/RelativePoseEstimator.hpp  |  7 +---
 cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.cpp |  8 ++--
 cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp |  2 +-
 cpp/src/DO/Sara/SfM/Graph/FeatureGraph.hpp    |  1 -
 .../Sara/SfM/OdometryV2/OdometryPipeline.cpp  | 39 ++++++++++---------
 .../Sara/SfM/OdometryV2/OdometryPipeline.hpp  | 10 ++---
 7 files changed, 33 insertions(+), 40 deletions(-)

diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/RelativePoseEstimator.cpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/RelativePoseEstimator.cpp
index 4c7be664e..42b7a44db 100644
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks/RelativePoseEstimator.cpp
+++ b/cpp/src/DO/Sara/SfM/BuildingBlocks/RelativePoseEstimator.cpp
@@ -8,12 +8,6 @@
 
 using namespace DO::Sara::v2;
 
-RelativePoseEstimator::RelativePoseEstimator(
-    const v2::BrownConradyDistortionModel<double>& camera)
-{
-  configure(camera);
-}
-
 auto RelativePoseEstimator::configure(
     const v2::BrownConradyDistortionModel<double>& camera) -> void
 {
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/RelativePoseEstimator.hpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/RelativePoseEstimator.hpp
index 914ad0b55..2dd996b07 100644
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks/RelativePoseEstimator.hpp
+++ b/cpp/src/DO/Sara/SfM/BuildingBlocks/RelativePoseEstimator.hpp
@@ -26,11 +26,8 @@ namespace DO::Sara::v2 {
     Eigen::Matrix3d _K;
     Eigen::Matrix3d _K_inv;
 
-    RelativePoseEstimator(
-        const v2::BrownConradyDistortionModel<double>& camera);
-
-    auto configure(const v2::BrownConradyDistortionModel<double>& camera)
-        -> void;
+    auto
+    configure(const v2::BrownConradyDistortionModel<double>& camera) -> void;
 
     auto estimate_relative_pose(const KeypointList<OERegion, float>& src_keys,
                                 const KeypointList<OERegion, float>& dst_keys,
diff --git a/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.cpp b/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.cpp
index 0a551fd31..5e8fc1fb5 100644
--- a/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.cpp
+++ b/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.cpp
@@ -45,7 +45,7 @@ auto CameraPoseGraph::add_absolute_pose(
 auto CameraPoseGraph::add_relative_pose(
     const v2::RelativePoseEstimator& relative_pose_estimator,  //
     const FeatureParams& feature_params,                       //
-    const Vertex u, const Vertex v) -> void
+    const Vertex u, const Vertex v) -> std::pair<Edge, bool>
 {
   auto& logger = Logger::get();
 
@@ -53,11 +53,11 @@ auto CameraPoseGraph::add_relative_pose(
   const auto& src_keys = _g[u].keypoints;
   const auto& dst_keys = _g[v].keypoints;
   if (features(src_keys).empty() || features(dst_keys).empty())
-    return;
+    return {{}, false};
 
   auto matches = match(src_keys, dst_keys, feature_params.sift_nn_ratio);
   if (matches.empty())
-    return;
+    return {{}, false};
   if (matches.size() > feature_params.num_matches_max)
     matches.resize(feature_params.num_matches_max);
 
@@ -78,4 +78,6 @@ auto CameraPoseGraph::add_relative_pose(
     relative_motion_data.matches = std::move(matches);
     relative_motion_data.inliers = std::move(inliers);
   }
+
+  return {e, edge_added};
 }
diff --git a/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp b/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp
index 701ef40a5..2792d5e42 100644
--- a/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp
+++ b/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp
@@ -93,7 +93,7 @@ namespace DO::Sara {
     auto add_relative_pose(
         const v2::RelativePoseEstimator& relative_pose_estimator,  //
         const FeatureParams& feature_params,                       //
-        const Vertex src, const Vertex dst) -> void;
+        const Vertex src, const Vertex dst) -> std::pair<Edge, bool>;
 
   private:
     //! @brief The graph data structure shortened as g.
diff --git a/cpp/src/DO/Sara/SfM/Graph/FeatureGraph.hpp b/cpp/src/DO/Sara/SfM/Graph/FeatureGraph.hpp
index c5dd624dd..6d76ccdbe 100644
--- a/cpp/src/DO/Sara/SfM/Graph/FeatureGraph.hpp
+++ b/cpp/src/DO/Sara/SfM/Graph/FeatureGraph.hpp
@@ -12,7 +12,6 @@
 #pragma once
 
 #include <DO/Sara/SfM/Graph/CameraPoseGraph.hpp>
-#include <boost/graph/detail/adjacency_list.hpp>
 
 
 namespace DO::Sara {
diff --git a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
index a89fa9303..a7f1d4734 100644
--- a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
+++ b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
@@ -11,7 +11,12 @@
 
 #include <DO/Sara/SfM/OdometryV2/OdometryPipeline.hpp>
 
-#include <DO/Sara/SfM/Graph/CameraPoseGraph.hpp>
+#include <DO/Sara/Logging/Logger.hpp>
+
+#include <DO/Sara/Graphics/ImageDraw.hpp>
+#include <DO/Sara/Visualization/Features/Draw.hpp>
+
+#include <DO/Sara/FeatureDetectors/SIFT.hpp>
 #include <DO/Sara/SfM/Helpers/KeypointMatching.hpp>
 
 
@@ -53,29 +58,28 @@ auto v2::OdometryPipeline::make_display_frame() const -> Image<Rgb8>
   return _distortion_corrector->frame_rgb8();
 }
 
-auto v2::OdometryPipeline::detect_keypoints() const
+auto v2::OdometryPipeline::detect_keypoints(const ImageView<float>& image) const
     -> KeypointList<OERegion, float>
 {
-  return compute_sift_keypoints(_distortion_corrector->frame_gray32f(),
-                                _feature_params.image_pyr_params);
+  return compute_sift_keypoints(image, _feature_params.image_pyr_params);
 }
 
 auto v2::OdometryPipeline::estimate_relative_pose(
-    const CameraPoseGraph::Vertex u,  //
-    const CameraPoseGraph::Vertex v) const
+    const CameraPoseGraph::Vertex pose_u,  //
+    const CameraPoseGraph::Vertex pose_v) const
     -> std::pair<RelativePoseData, TwoViewGeometry>
 {
   auto& logger = Logger::get();
 
   SARA_LOGI(logger, "Matching features...");
-  const auto& keys_u = _pose_graph[u].keypoints;
-  const auto& keys_v = _pose_graph[v].keypoints;
+  const auto& keys_u = _pose_graph[pose_u].keypoints;
+  const auto& keys_v = _pose_graph[pose_v].keypoints;
   if (features(keys_u).empty() || features(keys_v).empty())
-    return;
+    return {};
 
   auto matches = match(keys_u, keys_v, _feature_params.sift_nn_ratio);
   if (matches.empty())
-    return;
+    return {};
   if (matches.size() > _feature_params.num_matches_max)
     matches.resize(_feature_params.num_matches_max);
 
@@ -85,15 +89,14 @@ auto v2::OdometryPipeline::estimate_relative_pose(
   const auto num_inliers = inliers.flat_array().count();
   SARA_LOGI(logger, "inlier count: {}", num_inliers);
 
-  const auto num_inliers = inliers.flat_array().count();
-  SARA_LOGI(logger, "inlier count: {}", num_inliers);
-
-  return std::make_pair(RelativePoseData{.matches = std::move(matches),
-                                         .inliers = std::move(inliers),
-                                         .motion = {}
+  return {
+      RelativePoseData{.matches = std::move(matches),
+                       .inliers = std::move(inliers),
+                       .motion = {}
 
-                        },
-                        geometry);
+      },
+      geometry  //
+  };
 }
 
 auto v2::OdometryPipeline::add_camera_pose_and_grow_point_cloud() -> bool
diff --git a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.hpp b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.hpp
index 50d1f324e..7ca06be46 100644
--- a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.hpp
+++ b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.hpp
@@ -12,10 +12,8 @@
 #pragma once
 
 #include <DO/Sara/Features/KeypointList.hpp>
-#include <DO/Sara/Graphics/ImageDraw.hpp>
 #include <DO/Sara/SfM/Odometry/ImageDistortionCorrector.hpp>
 #include <DO/Sara/SfM/Odometry/VideoStreamer.hpp>
-#include <DO/Sara/Visualization/Features/Draw.hpp>
 
 #include <DO/Sara/SfM/BuildingBlocks/RelativePoseEstimator.hpp>
 #include <DO/Sara/SfM/Graph/CameraPoseGraph.hpp>
@@ -26,9 +24,9 @@ namespace DO::Sara::v2 {
   class OdometryPipeline
   {
   public:
-    auto set_config(const std::filesystem::path& video_path,
-                    const v2::BrownConradyDistortionModel<double>& camera)
-        -> void;
+    auto
+    set_config(const std::filesystem::path& video_path,
+               const v2::BrownConradyDistortionModel<double>& camera) -> void;
 
     auto read() -> bool;
 
@@ -47,7 +45,7 @@ namespace DO::Sara::v2 {
   private: /* graph update tasks */
     auto add_camera_pose_and_grow_point_cloud() -> bool;
 
-  private: /* data members */
+  public: /* data members */
     VideoStreamer _video_streamer;
     v2::BrownConradyDistortionModel<double> _camera;
 

From 4530b2d72ea61a49ffbdf0d160229e82f699106c Mon Sep 17 00:00:00 2001
From: David OK <david.ok8@gmail.com>
Date: Sat, 13 Apr 2024 11:09:14 +0100
Subject: [PATCH 29/49] WIP: save work.

---
 .../Sara/SfM/BuildingBlocks/FeatureParams.hpp |  2 +-
 cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.cpp | 41 +-------
 cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp | 11 ++-
 .../Sara/SfM/OdometryV2/OdometryPipeline.cpp  | 94 +++++++++++++++++--
 .../Sara/SfM/OdometryV2/OdometryPipeline.hpp  | 10 +-
 5 files changed, 105 insertions(+), 53 deletions(-)

diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/FeatureParams.hpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/FeatureParams.hpp
index 7ee194712..17303cc6b 100644
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks/FeatureParams.hpp
+++ b/cpp/src/DO/Sara/SfM/BuildingBlocks/FeatureParams.hpp
@@ -10,7 +10,7 @@ namespace DO::Sara {
     ImagePyramidParams image_pyr_params = ImagePyramidParams(0);
     float sift_nn_ratio = 0.6f;
     std::size_t num_matches_max = 1000u;
-    std::size_t num_inliers_min = 100u;
+    int num_inliers_min = 100;
   };
 
 }  // namespace DO::Sara
diff --git a/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.cpp b/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.cpp
index 5e8fc1fb5..6cff045be 100644
--- a/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.cpp
+++ b/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.cpp
@@ -43,41 +43,10 @@ auto CameraPoseGraph::add_absolute_pose(
 }
 
 auto CameraPoseGraph::add_relative_pose(
-    const v2::RelativePoseEstimator& relative_pose_estimator,  //
-    const FeatureParams& feature_params,                       //
-    const Vertex u, const Vertex v) -> std::pair<Edge, bool>
+    const RelativePoseData& relative_pose_data,  //
+    const Vertex u, const Vertex v) -> bool
 {
-  auto& logger = Logger::get();
-
-  SARA_LOGI(logger, "Match features...");
-  const auto& src_keys = _g[u].keypoints;
-  const auto& dst_keys = _g[v].keypoints;
-  if (features(src_keys).empty() || features(dst_keys).empty())
-    return {{}, false};
-
-  auto matches = match(src_keys, dst_keys, feature_params.sift_nn_ratio);
-  if (matches.empty())
-    return {{}, false};
-  if (matches.size() > feature_params.num_matches_max)
-    matches.resize(feature_params.num_matches_max);
-
-  SARA_LOGI(logger, "Estimating relative pose...");
-  auto [geometry, inliers, sample_best] =
-      relative_pose_estimator.estimate_relative_pose(src_keys, dst_keys,
-                                                     matches);
-  const auto num_inliers = inliers.flat_array().count();
-  SARA_LOGI(logger, "inlier count: {}", num_inliers);
-
-  const auto success = num_inliers > 100;
-  auto e = Edge{};
-  auto edge_added = false;
-  if (success)
-  {
-    std::tie(e, edge_added) = boost::add_edge(u, v, _g);
-    auto& relative_motion_data = _g[e];
-    relative_motion_data.matches = std::move(matches);
-    relative_motion_data.inliers = std::move(inliers);
-  }
-
-  return {e, edge_added};
+  const auto [e, edge_added] = boost::add_edge(u, v, _g);
+  _g[e] = relative_pose_data;
+  return edge_added;
 }
diff --git a/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp b/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp
index 2792d5e42..aeee5ed0c 100644
--- a/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp
+++ b/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp
@@ -82,6 +82,11 @@ namespace DO::Sara {
       return _g[e];
     }
 
+    auto edge(const Vertex u, const Vertex v) const -> std::pair<Edge, bool>
+    {
+      return boost::edge(u, v, _g);
+    }
+
     auto num_vertices() const -> VertexIndex
     {
       return boost::num_vertices(_g);
@@ -90,10 +95,8 @@ namespace DO::Sara {
     auto add_absolute_pose(KeypointList<OERegion, float>&& keypoints,
                            const int image_id) -> Vertex;
 
-    auto add_relative_pose(
-        const v2::RelativePoseEstimator& relative_pose_estimator,  //
-        const FeatureParams& feature_params,                       //
-        const Vertex src, const Vertex dst) -> std::pair<Edge, bool>;
+    auto add_relative_pose(const RelativePoseData& relative_pose_data,  //
+                           const Vertex src, const Vertex dst) -> bool;
 
   private:
     //! @brief The graph data structure shortened as g.
diff --git a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
index a7f1d4734..5befdcb94 100644
--- a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
+++ b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
@@ -61,6 +61,8 @@ auto v2::OdometryPipeline::make_display_frame() const -> Image<Rgb8>
 auto v2::OdometryPipeline::detect_keypoints(const ImageView<float>& image) const
     -> KeypointList<OERegion, float>
 {
+  auto& logger = Logger::get();
+  SARA_LOGI(logger, "[Feature Detection] Matching image keypoints...");
   return compute_sift_keypoints(image, _feature_params.image_pyr_params);
 }
 
@@ -71,34 +73,86 @@ auto v2::OdometryPipeline::estimate_relative_pose(
 {
   auto& logger = Logger::get();
 
-  SARA_LOGI(logger, "Matching features...");
   const auto& keys_u = _pose_graph[pose_u].keypoints;
   const auto& keys_v = _pose_graph[pose_v].keypoints;
   if (features(keys_u).empty() || features(keys_v).empty())
+  {
+    SARA_LOGI(logger, "[Relative Pose] Skipped image matching...");
     return {};
+  }
 
   auto matches = match(keys_u, keys_v, _feature_params.sift_nn_ratio);
+  SARA_LOGI(logger, "[Relative Pose] Matched image keypoints...");
   if (matches.empty())
     return {};
   if (matches.size() > _feature_params.num_matches_max)
     matches.resize(_feature_params.num_matches_max);
 
-  SARA_LOGI(logger, "Estimating relative pose...");
-  auto [geometry, inliers, sample_best] =
+  auto [two_view_geometry, inliers, sample_best] =
       _relative_pose_estimator.estimate_relative_pose(keys_u, keys_v, matches);
-  const auto num_inliers = inliers.flat_array().count();
-  SARA_LOGI(logger, "inlier count: {}", num_inliers);
+  SARA_LOGI(logger, "[Relative Pose] Estimated relative pose...");
 
-  return {
+  const auto res = std::pair{
       RelativePoseData{.matches = std::move(matches),
                        .inliers = std::move(inliers),
-                       .motion = {}
+                       .motion =
+                           {
+                               .R = two_view_geometry.C2.R,  //
+                               .t = two_view_geometry.C2.t   //
+                           }
 
       },
-      geometry  //
+      two_view_geometry  //
   };
+
+  return res;
+}
+
+auto v2::OdometryPipeline::update_absolute_pose_from_latest_relative_pose_data(
+    const RelativePoseData& relative_pose_data,
+    const TwoViewGeometry& two_view_geometry) -> bool
+{
+  auto& logger = Logger::get();
+
+  const auto num_inliers = relative_pose_data.inliers.flat_array().count();
+  SARA_LOGI(logger, "[SfM] Relative pose inliers: {} 3D points", num_inliers);
+  if (num_inliers < _feature_params.num_inliers_min)
+  {
+    SARA_LOGI(logger, "[SfM] Relative pose failed!");
+    return false;
+  }
+  SARA_LOGI(logger, "[SfM] Relative pose succeeded!");
+
+  if (_pose_graph.num_vertices() == 2)
+  {
+    SARA_LOGI(logger, "Initializing the first two camera poses...");
+    // Set the absolute pose of the first camera which is the identity rigid
+    // body transformation.
+    auto& initial_pose = _pose_graph[_pose_prev].pose;
+    {
+      initial_pose.q.setIdentity();
+      initial_pose.t.setZero();
+    }
+
+    // Set the absolute pose of the first camera as the first relative pose.
+    auto& second_pose = _pose_graph[_pose_curr].pose;
+    {
+      // HEURISTICS: Advance from only 5 cm at most to view something nice.
+      const auto [e, edge_exists] = _pose_graph.edge(_pose_prev, _pose_curr);
+      if (!edge_exists)
+        throw std::runtime_error{"Edge must exist!"};
+
+      _pose_graph[e] = relative_pose_data;
+    }
+
+    // TODO: make a new function for this.
+    // SARA_LOGI(logger, "Initializing the point cloud...");
+    // _point_cloud_operator->init_point_cloud(_tracks_alive, current_image,
+    //                                         _relative_pose_edge_id, _camera);
+  }
 }
 
+
 auto v2::OdometryPipeline::add_camera_pose_and_grow_point_cloud() -> bool
 {
   auto& logger = Logger::get();
@@ -112,12 +166,34 @@ auto v2::OdometryPipeline::add_camera_pose_and_grow_point_cloud() -> bool
                                              frame_number);
 
   const auto& pose_data = _pose_graph[_pose_curr];
-  SARA_LOGI(logger, "Camera [frame:{}]: {} keypoints",  //
+  SARA_LOGI(logger,
+            "[SfM] Initialized new camera pose[frame:{}]: {} keypoints",  //
             pose_data.image_id, features(pose_data.keypoints).size());
 
   // We need two frames at least for the epipolar geometry.
   if (_pose_graph.num_vertices() < 2)
     return false;
 
+  const auto [relative_pose_data, two_view_geometry] =
+      this->estimate_relative_pose(_pose_prev, _pose_curr);
+  const auto num_inliers = relative_pose_data.inliers.flat_array().count();
+  SARA_LOGI(logger, "[SfM] Relative pose inliers: {} 3D points", num_inliers);
+  if (num_inliers < _feature_params.num_inliers_min)
+  {
+    SARA_LOGI(logger, "[SfM] Relative pose failed!");
+    return false;
+  }
+  SARA_LOGI(logger, "[SfM] Relative pose succeeded!");
+
+  if (_pose_graph.num_vertices() == 2)
+  {
+    // Init point cloud.
+  }
+  else
+  {
+    // Grow point cloud by triangulation.
+  }
+
+
   return false;
 }
diff --git a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.hpp b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.hpp
index 7ca06be46..455095179 100644
--- a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.hpp
+++ b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.hpp
@@ -24,9 +24,9 @@ namespace DO::Sara::v2 {
   class OdometryPipeline
   {
   public:
-    auto
-    set_config(const std::filesystem::path& video_path,
-               const v2::BrownConradyDistortionModel<double>& camera) -> void;
+    auto set_config(const std::filesystem::path& video_path,
+                    const v2::BrownConradyDistortionModel<double>& camera)
+        -> void;
 
     auto read() -> bool;
 
@@ -42,6 +42,10 @@ namespace DO::Sara::v2 {
                                 const CameraPoseGraph::Vertex v) const
         -> std::pair<RelativePoseData, TwoViewGeometry>;
 
+    auto update_absolute_pose_from_latest_relative_pose_data(
+        const RelativePoseData& relative_pose_data,
+        const TwoViewGeometry& two_view_geometry) -> bool;
+
   private: /* graph update tasks */
     auto add_camera_pose_and_grow_point_cloud() -> bool;
 

From fd7c4091f774b45c82180ab697f646cd63364a7d Mon Sep 17 00:00:00 2001
From: Odd Kiva <2375733-oddkiva@users.noreply.gitlab.com>
Date: Sun, 14 Apr 2024 18:43:16 +0100
Subject: [PATCH 30/49] WIP: refactor code.

---
 .../Geometry/QuaternionBasedPose.hpp          |  15 +-
 cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.cpp |  30 ++--
 cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp |  24 ++-
 .../Sara/SfM/OdometryV2/OdometryPipeline.cpp  | 143 ++++++++----------
 .../Sara/SfM/OdometryV2/OdometryPipeline.hpp  |  21 ++-
 5 files changed, 119 insertions(+), 114 deletions(-)

diff --git a/cpp/src/DO/Sara/MultiViewGeometry/Geometry/QuaternionBasedPose.hpp b/cpp/src/DO/Sara/MultiViewGeometry/Geometry/QuaternionBasedPose.hpp
index ac331c983..be3af6040 100644
--- a/cpp/src/DO/Sara/MultiViewGeometry/Geometry/QuaternionBasedPose.hpp
+++ b/cpp/src/DO/Sara/MultiViewGeometry/Geometry/QuaternionBasedPose.hpp
@@ -12,8 +12,7 @@ namespace DO::Sara {
     Eigen::Quaternion<T> q;
     Eigen::Vector3<T> t;
 
-    inline auto operator()(const Eigen::Vector3<T>& x) const
-        -> Eigen::Vector3<T>
+    inline auto operator*(const Eigen::Vector3<T>& x) const -> Eigen::Vector3<T>
     {
       return q * x + t;
     }
@@ -25,9 +24,15 @@ namespace DO::Sara {
 
     inline auto matrix4() const -> Eigen::Matrix4<T>
     {
-      return (Eigen::Matrix<T, 4, 4>{} <<
-              matrix34(), Eigen::RowVector3<T>::Zero().homogeneous())
-          .finished();
+      auto r = Eigen::Matrix<T, 4, 4>{};
+      r << matrix34(), Eigen::RowVector3<T>::Zero().homogeneous();
+      return r;
+    }
+
+    static inline auto identity() -> QuaternionBasedPose<T>
+    {
+      return {.q = Eigen::Quaternion<T>::Identity(),
+              .t = Eigen::Vector3<T>::Zero()};
     }
   };
 
diff --git a/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.cpp b/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.cpp
index 6cff045be..37b3b784f 100644
--- a/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.cpp
+++ b/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.cpp
@@ -20,33 +20,33 @@
 using namespace DO::Sara;
 
 
-auto CameraPoseGraph::add_absolute_pose(
-    KeypointList<OERegion, float>&& keypoints,  //
-    const int image_id) -> CameraPoseGraph::Vertex
+auto CameraPoseGraph::add_absolute_pose(AbsolutePoseData&& data)
+    -> CameraPoseGraph::Vertex
 {
   auto& logger = Logger::get();
 
-  SARA_LOGI(logger, "Detecting keypoints for image frame {}", image_id);
-
   // Grow the pose graph by creating a new camera vertex.
   const auto v = boost::add_vertex(_g);
 
   // Store the camera pose data.
-  auto& pose_data = _g[v];
-  pose_data.image_id = image_id;
-  pose_data.keypoints = std::move(keypoints);
+  _g[v] = std::move(data);
 
-  const auto& f = features(pose_data.keypoints);
-  SARA_LOGI(logger, "Camera {}: {} keypoints", v, f.size());
+  SARA_LOGI(logger,
+            "[SfM] Added camera absolute pose[frame:{}]:\n"
+            "Keypoints: {} points\n"
+            "Absolute pose: {}\n",             //
+            _g[v].image_id,                    //
+            features(_g[v].keypoints).size(),  //
+            _g[v].pose.matrix34());
 
   return v;
 }
 
-auto CameraPoseGraph::add_relative_pose(
-    const RelativePoseData& relative_pose_data,  //
-    const Vertex u, const Vertex v) -> bool
+auto CameraPoseGraph::add_relative_pose(const Vertex u, const Vertex v,
+                                        RelativePoseData&& relative_pose_data)
+    -> CameraPoseGraph::Edge
 {
   const auto [e, edge_added] = boost::add_edge(u, v, _g);
-  _g[e] = relative_pose_data;
-  return edge_added;
+  _g[e] = std::move(relative_pose_data);
+  return e;
 }
diff --git a/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp b/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp
index aeee5ed0c..5ba8584ce 100644
--- a/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp
+++ b/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp
@@ -33,6 +33,23 @@ namespace DO::Sara {
     KeypointList<OERegion, float> keypoints;
     //! @brief "Absolute" pose w.r.t. some reference frame.
     QuaternionBasedPose<double> pose;
+
+    AbsolutePoseData() = default;
+
+    AbsolutePoseData(const AbsolutePoseData&) = default;
+
+    AbsolutePoseData(AbsolutePoseData&&) = default;
+
+    AbsolutePoseData(const int image_id,
+                     KeypointList<OERegion, float>&& keypoints,
+                     QuaternionBasedPose<double>&& pose)
+      : image_id{image_id}
+      , keypoints{std::move(keypoints)}
+      , pose{std::move(pose)}
+    {
+    }
+
+    auto operator=(AbsolutePoseData&&) -> AbsolutePoseData& = default;
   };
 
   struct RelativePoseData
@@ -92,11 +109,10 @@ namespace DO::Sara {
       return boost::num_vertices(_g);
     }
 
-    auto add_absolute_pose(KeypointList<OERegion, float>&& keypoints,
-                           const int image_id) -> Vertex;
+    auto add_absolute_pose(AbsolutePoseData&& data) -> Vertex;
 
-    auto add_relative_pose(const RelativePoseData& relative_pose_data,  //
-                           const Vertex src, const Vertex dst) -> bool;
+    auto add_relative_pose(const Vertex src, const Vertex dst,
+                           RelativePoseData&& relative_pose_data) -> Edge;
 
   private:
     //! @brief The graph data structure shortened as g.
diff --git a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
index 5befdcb94..92006bce8 100644
--- a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
+++ b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
@@ -51,6 +51,8 @@ auto v2::OdometryPipeline::process() -> void
     return;
 
   _distortion_corrector->undistort();
+
+  add_camera_pose();
 }
 
 auto v2::OdometryPipeline::make_display_frame() const -> Image<Rgb8>
@@ -67,21 +69,19 @@ auto v2::OdometryPipeline::detect_keypoints(const ImageView<float>& image) const
 }
 
 auto v2::OdometryPipeline::estimate_relative_pose(
-    const CameraPoseGraph::Vertex pose_u,  //
-    const CameraPoseGraph::Vertex pose_v) const
+    const KeypointList<OERegion, float>& keys_src,
+    const KeypointList<OERegion, float>& keys_dst) const
     -> std::pair<RelativePoseData, TwoViewGeometry>
 {
   auto& logger = Logger::get();
 
-  const auto& keys_u = _pose_graph[pose_u].keypoints;
-  const auto& keys_v = _pose_graph[pose_v].keypoints;
-  if (features(keys_u).empty() || features(keys_v).empty())
+  if (features(keys_src).empty() || features(keys_dst).empty())
   {
     SARA_LOGI(logger, "[Relative Pose] Skipped image matching...");
     return {};
   }
 
-  auto matches = match(keys_u, keys_v, _feature_params.sift_nn_ratio);
+  auto matches = match(keys_src, keys_dst, _feature_params.sift_nn_ratio);
   SARA_LOGI(logger, "[Relative Pose] Matched image keypoints...");
   if (matches.empty())
     return {};
@@ -89,10 +89,11 @@ auto v2::OdometryPipeline::estimate_relative_pose(
     matches.resize(_feature_params.num_matches_max);
 
   auto [two_view_geometry, inliers, sample_best] =
-      _relative_pose_estimator.estimate_relative_pose(keys_u, keys_v, matches);
+      _relative_pose_estimator.estimate_relative_pose(keys_src, keys_dst,
+                                                      matches);
   SARA_LOGI(logger, "[Relative Pose] Estimated relative pose...");
 
-  const auto res = std::pair{
+  const auto res = std::make_pair(  //
       RelativePoseData{.matches = std::move(matches),
                        .inliers = std::move(inliers),
                        .motion =
@@ -102,98 +103,82 @@ auto v2::OdometryPipeline::estimate_relative_pose(
                            }
 
       },
-      two_view_geometry  //
-  };
+      std::move(two_view_geometry)  //
+  );
 
   return res;
 }
 
-auto v2::OdometryPipeline::update_absolute_pose_from_latest_relative_pose_data(
-    const RelativePoseData& relative_pose_data,
-    const TwoViewGeometry& two_view_geometry) -> bool
+auto v2::OdometryPipeline::add_camera_pose() -> bool
 {
   auto& logger = Logger::get();
 
-  const auto num_inliers = relative_pose_data.inliers.flat_array().count();
-  SARA_LOGI(logger, "[SfM] Relative pose inliers: {} 3D points", num_inliers);
-  if (num_inliers < _feature_params.num_inliers_min)
+  // Detect and describe the local features.
+  _pose_prev = _pose_curr;
+
+  const auto frame = _distortion_corrector->frame_gray32f();
+  const auto frame_number = _video_streamer.frame_number();
+  auto keys_curr = detect_keypoints(frame);
+
+  if (_pose_graph.num_vertices() == 1)
   {
-    SARA_LOGI(logger, "[SfM] Relative pose failed!");
-    return false;
+    // Initialize the new camera pose from the latest image frame.
+    auto abs_pose_curr = QuaternionBasedPose<double>::identity();
+    auto abs_pose_data = AbsolutePoseData{
+        frame_number,             //
+        std::move(keys_curr),     //
+        std::move(abs_pose_curr)  //
+    };
+    _pose_curr = _pose_graph.add_absolute_pose(std::move(abs_pose_data));
+
+    return true;
   }
-  SARA_LOGI(logger, "[SfM] Relative pose succeeded!");
-
-  if (_pose_graph.num_vertices() == 2)
+  else
   {
-    SARA_LOGI(logger, "Initializing the first two camera poses...");
-    // Set the absolute pose of the first camera which is the identity rigid
-    // body transformation.
-    auto& initial_pose = _pose_graph[_pose_prev].pose;
+    const auto& keys_prev = _pose_graph[_pose_prev].keypoints;
+    auto [rel_pose_data, two_view_geometry] =
+        estimate_relative_pose(keys_prev, keys_curr);
+    const auto num_inliers = rel_pose_data.inliers.flat_array().count();
+    SARA_LOGI(logger, "[SfM] Relative pose inliers: {} 3D points", num_inliers);
+    if (num_inliers < _feature_params.num_inliers_min)
     {
-      initial_pose.q.setIdentity();
-      initial_pose.t.setZero();
+      SARA_LOGI(logger, "[SfM] Relative pose failed!");
+      return false;
     }
+    SARA_LOGI(logger, "[SfM] Relative pose succeeded!");
 
-    // Set the absolute pose of the first camera as the first relative pose.
-    auto& second_pose = _pose_graph[_pose_curr].pose;
+    if (_pose_graph.num_vertices() == 2)
     {
-      // HEURISTICS: Advance from only 5 cm at most to view something nice.
-      const auto [e, edge_exists] = _pose_graph.edge(_pose_prev, _pose_curr);
-      if (!edge_exists)
-        throw std::runtime_error{"Edge must exist!"};
+      auto abs_pose_curr = QuaternionBasedPose<double>{
+          .q = Eigen::Quaterniond{rel_pose_data.motion.R},
+          .t = rel_pose_data.motion.t  //
+      };
 
-      _pose_graph[e] = relative_pose_data;
-    }
+      auto abs_pose_data = AbsolutePoseData{
+          frame_number,             //
+          std::move(keys_curr),     //
+          std::move(abs_pose_curr)  //
+      };
 
-    // TODO: make a new function for this.
-    // SARA_LOGI(logger, "Initializing the point cloud...");
-    // _point_cloud_operator->init_point_cloud(_tracks_alive, current_image,
-    //                                         _relative_pose_edge_id, _camera);
-  }
-}
+      // 1. Add the absolute pose vertex.
+      _pose_graph.add_absolute_pose(std::move(abs_pose_data));
 
+      // 2. Add the pose edge, which will invalidate the relative pose data.
+      _pose_graph.add_relative_pose(_pose_prev, _pose_curr,
+                                    std::move(rel_pose_data));
 
-auto v2::OdometryPipeline::add_camera_pose_and_grow_point_cloud() -> bool
-{
-  auto& logger = Logger::get();
+      // 3. TODO: Init point cloud
 
-  // Detect and describe the local features.
-  _pose_prev = _pose_curr;
-  const auto frame = _distortion_corrector->frame_gray32f();
-  const auto frame_number = _video_streamer.frame_number();
-  auto keypoints = detect_keypoints(frame);
-  _pose_curr = _pose_graph.add_absolute_pose(std::move(keypoints),  //
-                                             frame_number);
-
-  const auto& pose_data = _pose_graph[_pose_curr];
-  SARA_LOGI(logger,
-            "[SfM] Initialized new camera pose[frame:{}]: {} keypoints",  //
-            pose_data.image_id, features(pose_data.keypoints).size());
-
-  // We need two frames at least for the epipolar geometry.
-  if (_pose_graph.num_vertices() < 2)
-    return false;
-
-  const auto [relative_pose_data, two_view_geometry] =
-      this->estimate_relative_pose(_pose_prev, _pose_curr);
-  const auto num_inliers = relative_pose_data.inliers.flat_array().count();
-  SARA_LOGI(logger, "[SfM] Relative pose inliers: {} 3D points", num_inliers);
-  if (num_inliers < _feature_params.num_inliers_min)
-  {
-    SARA_LOGI(logger, "[SfM] Relative pose failed!");
-    return false;
-  }
-  SARA_LOGI(logger, "[SfM] Relative pose succeeded!");
+      return true;
+    }
+    else
+    {
+      // 1. Add the absolute pose vertex.
 
-  if (_pose_graph.num_vertices() == 2)
-  {
-    // Init point cloud.
-  }
-  else
-  {
-    // Grow point cloud by triangulation.
+      // TODO: Grow point cloud by triangulation.
+      return false;
+    }
   }
 
-
   return false;
 }
diff --git a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.hpp b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.hpp
index 455095179..acead5370 100644
--- a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.hpp
+++ b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.hpp
@@ -24,9 +24,9 @@ namespace DO::Sara::v2 {
   class OdometryPipeline
   {
   public:
-    auto set_config(const std::filesystem::path& video_path,
-                    const v2::BrownConradyDistortionModel<double>& camera)
-        -> void;
+    auto
+    set_config(const std::filesystem::path& video_path,
+               const v2::BrownConradyDistortionModel<double>& camera) -> void;
 
     auto read() -> bool;
 
@@ -34,20 +34,19 @@ namespace DO::Sara::v2 {
 
     auto make_display_frame() const -> Image<Rgb8>;
 
+    auto detect_keypoints() -> const KeypointList<OERegion, float>&;
+    auto estimate_relative_pose() -> const RelativePoseData&;
+
   private: /* computer vision tasks */
     auto detect_keypoints(const ImageView<float>&) const
         -> KeypointList<OERegion, float>;
 
-    auto estimate_relative_pose(const CameraPoseGraph::Vertex u,
-                                const CameraPoseGraph::Vertex v) const
-        -> std::pair<RelativePoseData, TwoViewGeometry>;
-
-    auto update_absolute_pose_from_latest_relative_pose_data(
-        const RelativePoseData& relative_pose_data,
-        const TwoViewGeometry& two_view_geometry) -> bool;
+    auto estimate_relative_pose(const KeypointList<OERegion, float>& keys_src,
+                                const KeypointList<OERegion, float>& keys_dst)
+        const -> std::pair<RelativePoseData, TwoViewGeometry>;
 
   private: /* graph update tasks */
-    auto add_camera_pose_and_grow_point_cloud() -> bool;
+    auto add_camera_pose() -> bool;
 
   public: /* data members */
     VideoStreamer _video_streamer;

From 24a7ebd06e69db5dada5708caa7e012bfb7c73fd Mon Sep 17 00:00:00 2001
From: Odd Kiva <2375733-oddkiva@users.noreply.gitlab.com>
Date: Sun, 14 Apr 2024 19:28:44 +0100
Subject: [PATCH 31/49] MAINT: remove audio stream code.

It no longer compiles with FFmpeg 7.0, and we are not encoding any audio
data anyway.
---
 cpp/src/DO/Sara/VideoIO/VideoWriter.cpp | 325 ------------------------
 cpp/src/DO/Sara/VideoIO/VideoWriter.hpp |   3 -
 2 files changed, 328 deletions(-)

diff --git a/cpp/src/DO/Sara/VideoIO/VideoWriter.cpp b/cpp/src/DO/Sara/VideoIO/VideoWriter.cpp
index b6620f213..789cffdb4 100644
--- a/cpp/src/DO/Sara/VideoIO/VideoWriter.cpp
+++ b/cpp/src/DO/Sara/VideoIO/VideoWriter.cpp
@@ -51,7 +51,6 @@ av_always_inline char* av_err2str(int errnum)
 }
 #endif
 
-static constexpr auto STREAM_FRAME_RATE = 25;              /* 25 images/s */
 static constexpr auto STREAM_PIX_FMT = AV_PIX_FMT_YUV420P; /* default pix_fmt */
 
 
@@ -112,90 +111,6 @@ namespace DO::Sara {
 
   // ======================================================================== //
   // Add an output stream.
-  static void add_stream(OutputStream* out_stream, AVFormatContext* out_context,
-                         const AVCodec** codec, enum AVCodecID codec_id)
-  {
-    AVCodecContext* c;
-    /* find the encoder */
-    *codec = avcodec_find_encoder(codec_id);
-    if (!(*codec))
-      throw std::runtime_error{format("Could not find encoder for '%s'",
-                                      avcodec_get_name(codec_id))};
-    out_stream->stream = avformat_new_stream(out_context, nullptr);
-    if (!out_stream->stream)
-      throw std::runtime_error{"Could not allocate stream"};
-
-    out_stream->stream->id = out_context->nb_streams - 1;
-    c = avcodec_alloc_context3(*codec);
-    if (!c)
-      throw std::runtime_error{"Could not alloc an encoding context"};
-
-    out_stream->encoding_context = c;
-    switch ((*codec)->type)
-    {
-    case AVMEDIA_TYPE_AUDIO:
-      c->sample_fmt =
-          (*codec)->sample_fmts ? (*codec)->sample_fmts[0] : AV_SAMPLE_FMT_FLTP;
-      c->bit_rate = 64000;
-      c->sample_rate = 44100;
-      if ((*codec)->supported_samplerates)
-      {
-        c->sample_rate = (*codec)->supported_samplerates[0];
-        for (int i = 0; (*codec)->supported_samplerates[i]; ++i)
-        {
-          if ((*codec)->supported_samplerates[i] == 44100)
-            c->sample_rate = 44100;
-        }
-      }
-      c->channels = av_get_channel_layout_nb_channels(c->channel_layout);
-      c->channel_layout = AV_CH_LAYOUT_STEREO;
-      if ((*codec)->channel_layouts)
-      {
-        c->channel_layout = (*codec)->channel_layouts[0];
-        for (int i = 0; (*codec)->channel_layouts[i]; ++i)
-        {
-          if ((*codec)->channel_layouts[i] == AV_CH_LAYOUT_STEREO)
-            c->channel_layout = AV_CH_LAYOUT_STEREO;
-        }
-      }
-      c->channels = av_get_channel_layout_nb_channels(c->channel_layout);
-      out_stream->stream->time_base = AVRational{1, c->sample_rate};
-      break;
-    case AVMEDIA_TYPE_VIDEO:
-      c->codec_id = codec_id;
-      c->bit_rate = 400000;
-      /* Resolution must be a multiple of two. */
-      c->width = 352;
-      c->height = 288;
-      /* timebase: This is the fundamental unit of time (in seconds) in terms
-       * of which frame timestamps are represented. For fixed-fps content,
-       * timebase should be 1/framerate and timestamp increments should be
-       * identical to 1. */
-      out_stream->stream->time_base = {1, STREAM_FRAME_RATE};
-      c->time_base = out_stream->stream->time_base;
-      c->gop_size = 12; /* emit one intra frame every twelve frames at most */
-      c->pix_fmt = STREAM_PIX_FMT;
-      if (c->codec_id == AV_CODEC_ID_MPEG2VIDEO)
-      {
-        /* just for testing, we also add B-frames */
-        c->max_b_frames = 2;
-      }
-      if (c->codec_id == AV_CODEC_ID_MPEG1VIDEO)
-      {
-        /* Needed to avoid using macroblocks in which some coeffs overflow.
-         * This does not happen with normal video, it just happens here as
-         * the motion of the chroma plane does not match the luma plane. */
-        c->mb_decision = 2;
-      }
-      break;
-    default:
-      break;
-    }
-    /* Some formats want stream headers to be separate. */
-    if (out_context->oformat->flags & AVFMT_GLOBALHEADER)
-      c->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
-  }
-
   static void add_video_stream(OutputStream* ostream,
                                AVFormatContext* format_context,
                                const AVCodec** codec,
@@ -252,155 +167,6 @@ namespace DO::Sara {
   }
 
 
-  // ======================================================================== //
-  // Audio output
-  //
-  static AVFrame* alloc_audio_frame(enum AVSampleFormat sample_fmt,
-                                    uint64_t channel_layout, int sample_rate,
-                                    int nb_samples)
-  {
-    AVFrame* frame = av_frame_alloc();
-    if (!frame)
-      throw std::runtime_error{"Error allocating an audio frame"};
-
-    frame->format = sample_fmt;
-    frame->channel_layout = channel_layout;
-    frame->sample_rate = sample_rate;
-    frame->nb_samples = nb_samples;
-    if (nb_samples)
-    {
-      const auto ret = av_frame_get_buffer(frame, 0);
-      if (ret < 0)
-        throw std::runtime_error{"Error allocating an audio buffer"};
-    }
-    return frame;
-  }
-
-  static void open_audio(AVFormatContext*,
-                         const AVCodec* codec,
-                         OutputStream* ost,
-                         AVDictionary* opt_arg)
-  {
-    AVCodecContext* c;
-    int nb_samples;
-    int ret;
-    AVDictionary* opt = nullptr;
-    c = ost->encoding_context;
-    /* open it */
-    av_dict_copy(&opt, opt_arg, 0);
-    ret = avcodec_open2(c, codec, &opt);
-    av_dict_free(&opt);
-    if (ret < 0)
-      throw std::runtime_error{
-          format("Could not open audio codec: %s", av_err2str(ret))};
-
-    /* init signal generator */
-    ost->t = 0;
-    ost->tincr = static_cast<float>(2 * M_PI * 110.0 / c->sample_rate);
-    /* increment frequency by 110 Hz per second */
-    ost->tincr2 =
-        static_cast<float>(2 * M_PI * 110.0 / c->sample_rate / c->sample_rate);
-    if (c->codec->capabilities & AV_CODEC_CAP_VARIABLE_FRAME_SIZE)
-      nb_samples = 10000;
-    else
-      nb_samples = c->frame_size;
-    ost->frame = alloc_audio_frame(c->sample_fmt, c->channel_layout,
-                                   c->sample_rate, nb_samples);
-    ost->tmp_frame = alloc_audio_frame(AV_SAMPLE_FMT_S16, c->channel_layout,
-                                       c->sample_rate, nb_samples);
-    /* copy the stream parameters to the muxer */
-    ret = avcodec_parameters_from_context(ost->stream->codecpar, c);
-    if (ret < 0)
-      throw std::runtime_error{"Could not copy the stream parameters"};
-
-    /* create resampler context */
-    ost->swr_ctx = swr_alloc();
-    if (!ost->swr_ctx)
-      throw std::runtime_error{"Could not allocate resampler context"};
-
-    /* set options */
-    av_opt_set_int(ost->swr_ctx, "in_channel_count", c->channels, 0);
-    av_opt_set_int(ost->swr_ctx, "in_sample_rate", c->sample_rate, 0);
-    av_opt_set_sample_fmt(ost->swr_ctx, "in_sample_fmt", AV_SAMPLE_FMT_S16, 0);
-    av_opt_set_int(ost->swr_ctx, "out_channel_count", c->channels, 0);
-    av_opt_set_int(ost->swr_ctx, "out_sample_rate", c->sample_rate, 0);
-    av_opt_set_sample_fmt(ost->swr_ctx, "out_sample_fmt", c->sample_fmt, 0);
-    /* initialize the resampling context */
-    if ((ret = swr_init(ost->swr_ctx)) < 0)
-      throw std::runtime_error{"Failed to initialize the resampling context\n"};
-  }
-
-
-#if 0
-  /* Prepare a 16 bit dummy audio frame of 'frame_size' samples and
-   * 'nb_channels' channels. */
-  static AVFrame* get_audio_frame(OutputStream* ost)
-  {
-    AVFrame* frame = ost->tmp_frame;
-    int j, i, v;
-    int16_t* q = reinterpret_cast<int16_t*>(frame->data[0]);
-    /* check if we want to generate more frames */
-    if (av_compare_ts(ost->next_pts, ost->encoding_context->time_base,
-                      static_cast<std::int64_t>(STREAM_DURATION), {1, 1}) > 0)
-      return nullptr;
-    for (j = 0; j < frame->nb_samples; j++)
-    {
-      v = (int) (sin(ost->t) * 10000);
-      for (i = 0; i < ost->encoding_context->channels; i++)
-        *q++ = v;
-      ost->t += ost->tincr;
-      ost->tincr += ost->tincr2;
-    }
-    frame->pts = ost->next_pts;
-    ost->next_pts += frame->nb_samples;
-    return frame;
-  }
-
-  /*
-   * encode one audio frame and send it to the muxer
-   * return 1 when encoding is finished, 0 otherwise
-   */
-  static int write_audio_frame(AVFormatContext* oc, OutputStream* ost)
-  {
-    AVCodecContext* c;
-    AVFrame* frame;
-    int ret;
-    int dst_nb_samples;
-    c = ost->encoding_context;
-    frame = get_audio_frame(ost);
-    if (frame)
-    {
-      /* convert samples from native format to destination codec format, using
-       * the resampler */
-      /* compute destination number of samples */
-      dst_nb_samples = static_cast<int>(av_rescale_rnd(
-          swr_get_delay(ost->swr_ctx, c->sample_rate) + frame->nb_samples,
-          c->sample_rate, c->sample_rate, AV_ROUND_UP));
-      av_assert0(dst_nb_samples == frame->nb_samples);
-      /* when we pass a frame to the encoder, it may keep a reference to it
-       * internally;
-       * make sure we do not overwrite it here
-       */
-      ret = av_frame_make_writable(ost->frame);
-      if (ret < 0)
-        throw std::runtime_error{"Could not make frame writable!"};
-
-      /* convert to destination format */
-      ret = swr_convert(ost->swr_ctx, ost->frame->data, dst_nb_samples,
-                        (const uint8_t**) frame->data, frame->nb_samples);
-      if (ret < 0)
-        throw std::runtime_error{"Error while converting audio frame!"};
-
-      frame = ost->frame;
-      frame->pts =
-          av_rescale_q(ost->samples_count, {1, c->sample_rate}, c->time_base);
-      ost->samples_count += dst_nb_samples;
-    }
-    return write_frame(oc, c, ost->stream, frame);
-  }
-#endif
-
-
   // ======================================================================== //
   // Video output
   //
@@ -458,81 +224,6 @@ namespace DO::Sara {
       throw std::runtime_error{"Could not copy the stream parameters!"};
   }
 
-#if 0
-  /* Prepare a dummy image. */
-  static void fill_yuv_image(AVFrame* pict, int frame_index, int width,
-                             int height)
-  {
-    int x, y, i;
-    i = frame_index;
-    /* Y */
-    for (y = 0; y < height; y++)
-      for (x = 0; x < width; x++)
-        pict->data[0][y * pict->linesize[0] + x] = x + y + i * 3;
-    /* Cb and Cr */
-    for (y = 0; y < height / 2; y++)
-    {
-      for (x = 0; x < width / 2; x++)
-      {
-        pict->data[1][y * pict->linesize[1] + x] = 128 + y + i * 2;
-        pict->data[2][y * pict->linesize[2] + x] = 64 + x + i * 5;
-      }
-    }
-  }
-
-  static AVFrame* get_video_frame(OutputStream* ostream)
-  {
-    AVCodecContext* c = ostream->encoding_context;
-    /* check if we want to generate more frames */
-    if (av_compare_ts(ostream->next_pts, c->time_base,
-                      static_cast<std::int64_t>(STREAM_DURATION), {1, 1}) > 0)
-      return nullptr;
-    /* when we pass a frame to the encoder, it may keep a reference to it
-     * internally; make sure we do not overwrite it here */
-    if (av_frame_make_writable(ostream->frame) < 0)
-      throw std::runtime_error{"Could not make frame writable!"};
-
-    if (c->pix_fmt != AV_PIX_FMT_YUV420P)
-    {
-      /* as we only generate a YUV420P picture, we must convert it
-       * to the codec pixel format if needed */
-      if (!ostream->sws_ctx)
-      {
-        ostream->sws_ctx = sws_getContext(
-            c->width, c->height, AV_PIX_FMT_YUV420P, c->width, c->height,
-            c->pix_fmt, SCALE_FLAGS, nullptr, nullptr, nullptr);
-        if (!ostream->sws_ctx)
-          throw std::runtime_error{
-              "Could not initialize the conversion context"};
-      }
-      fill_yuv_image(ostream->tmp_frame, static_cast<int>(ostream->next_pts),
-                     c->width, c->height);
-      sws_scale(ostream->sws_ctx,
-                (const uint8_t* const*) ostream->tmp_frame->data,
-                ostream->tmp_frame->linesize, 0, c->height,
-                ostream->frame->data, ostream->frame->linesize);
-    }
-    else
-      fill_yuv_image(ostream->frame, static_cast<int>(ostream->next_pts),
-                     c->width, c->height);
-
-    ostream->frame->pts = ostream->next_pts++;
-
-    return ostream->frame;
-  }
-
-  /*
-   * encode one video frame and send it to the muxer
-   * return 1 when encoding is finished, 0 otherwise
-   */
-  static int write_video_frame(AVFormatContext* format_context,
-                               OutputStream* ostream)
-  {
-    return write_frame(format_context, ostream->encoding_context,
-                       ostream->stream, get_video_frame(ostream));
-  }
-#endif
-
   static void close_stream(AVFormatContext*, OutputStream* os)
   {
     avcodec_free_context(&os->encoding_context);
@@ -575,20 +266,11 @@ namespace DO::Sara {
       _have_video = 1;
       _encode_video = 1;
     }
-    if (_output_format->audio_codec != AV_CODEC_ID_NONE)
-    {
-      add_stream(&_audio_stream, _format_context, &_audio_codec,
-                 _output_format->audio_codec);
-      _have_audio = 1;
-      _encode_audio = 1;
-    }
 
     /* Now that all the parameters are set, we can open the audio and
      * video codecs and allocate the necessary encode buffers. */
     if (_have_video)
       open_video(_format_context, _video_codec, &_video_stream, _options);
-    if (_have_audio)
-      open_audio(_format_context, _audio_codec, &_audio_stream, _options);
     av_dump_format(_format_context, 0, filepath.c_str(), 1);
 
     auto ret = int{};
@@ -681,13 +363,6 @@ namespace DO::Sara {
       _have_video = 0;
       _video_stream = {};
     }
-    // Close the audio codec.
-    if (_have_audio)
-    {
-      close_stream(_format_context, &_audio_stream);
-      _have_audio = 0;
-      _audio_stream = {};
-    }
 
     // Close the output file.
     if (_output_format)
diff --git a/cpp/src/DO/Sara/VideoIO/VideoWriter.hpp b/cpp/src/DO/Sara/VideoIO/VideoWriter.hpp
index 7a3621e35..12fa8331c 100644
--- a/cpp/src/DO/Sara/VideoIO/VideoWriter.hpp
+++ b/cpp/src/DO/Sara/VideoIO/VideoWriter.hpp
@@ -67,13 +67,10 @@ namespace DO::Sara {
     OutputStream _audio_stream;
     const AVOutputFormat* _output_format = nullptr;
     AVFormatContext* _format_context;
-    const AVCodec* _audio_codec = nullptr;
     const AVCodec* _video_codec = nullptr;
     AVDictionary* _options = nullptr;
 
-    int _have_audio = 0;
     int _have_video = 0;
-    int _encode_audio = 0;
     int _encode_video = 0;
   };
 

From e502b4ef0ba08e98b779905b521aa6ec7de39843 Mon Sep 17 00:00:00 2001
From: Odd Kiva <2375733-oddkiva@users.noreply.gitlab.com>
Date: Sun, 14 Apr 2024 20:48:36 +0100
Subject: [PATCH 32/49] WIP: add notes.

---
 cpp/src/DO/Sara/SfM/Graph/FeatureGraph.hpp    |  18 ++--
 cpp/src/DO/Sara/SfM/Graph/FeatureTracker.cpp  |  85 ++++++++-------
 cpp/src/DO/Sara/SfM/Graph/FeatureTracker.hpp  |   7 +-
 .../Sara/SfM/OdometryV2/OdometryPipeline.cpp  | 101 ++++++++++--------
 4 files changed, 114 insertions(+), 97 deletions(-)

diff --git a/cpp/src/DO/Sara/SfM/Graph/FeatureGraph.hpp b/cpp/src/DO/Sara/SfM/Graph/FeatureGraph.hpp
index 6d76ccdbe..9fddb0292 100644
--- a/cpp/src/DO/Sara/SfM/Graph/FeatureGraph.hpp
+++ b/cpp/src/DO/Sara/SfM/Graph/FeatureGraph.hpp
@@ -40,20 +40,23 @@ namespace DO::Sara {
   struct MatchGID
   {
     //! @brief Index of the epipolar edge connecting camera i and camera j.
-    CameraPoseGraph::Vertex i;
-    CameraPoseGraph::Vertex j;
+    CameraPoseGraph::Vertex pose_src;
+    CameraPoseGraph::Vertex pose_dst;
     //! @brief Local match index.
     std::size_t index;
 
     auto operator==(const MatchGID& other) const -> bool
     {
-      return i == other.i && j == other.j && index == other.index;
+      return pose_src == other.pose_src && pose_dst == other.pose_dst &&
+             index == other.index;
     }
 
     auto operator<(const MatchGID& other) const -> bool
     {
-      return (i < other.i) || (i == other.i && j < other.j) ||
-             (i == other.i && j == other.j && index < other.index);
+      return (pose_src < other.pose_src) ||
+             (pose_src == other.pose_src && pose_dst < other.pose_dst) ||
+             (pose_src == other.pose_src && pose_dst == other.pose_dst &&
+              index < other.index);
     }
   };
 
@@ -97,9 +100,8 @@ namespace DO::Sara {
     auto calculate_feature_tracks() const -> std::vector<Track>;
     auto filter_by_non_max_suppression(const Track&,
                                        const CameraPoseGraph&) const -> Track;
-    auto find_vertex_from_camera_view(const Track&,
-                                      const CameraPoseGraph::Vertex&) const
-        -> Vertex;
+    auto find_vertex_from_camera_view(
+        const Track&, const CameraPoseGraph::Vertex&) const -> Vertex;
 
   private:
     Impl _feature_graph;
diff --git a/cpp/src/DO/Sara/SfM/Graph/FeatureTracker.cpp b/cpp/src/DO/Sara/SfM/Graph/FeatureTracker.cpp
index 497627f90..88095b0d3 100644
--- a/cpp/src/DO/Sara/SfM/Graph/FeatureTracker.cpp
+++ b/cpp/src/DO/Sara/SfM/Graph/FeatureTracker.cpp
@@ -19,75 +19,78 @@
 using namespace DO::Sara;
 
 auto FeatureTracker::update_feature_tracks(
-    const CameraPoseGraph& camera_pose_graph,
-    const CameraPoseGraph::Edge relative_pose_edge) -> void
+    const CameraPoseGraph& pose_graph,
+    const CameraPoseGraph::Edge pose_edge) -> void
 {
   auto& logger = Logger::get();
 
-  const CameraPoseGraph::Impl& cg = camera_pose_graph;
+  const CameraPoseGraph::Impl& pg = pose_graph;
   FeatureGraph::Impl& fg = _feature_graph;
 
-  // Retrieve the two camera vertices from the relative pose edge.
-  const auto pose_i = boost::source(relative_pose_edge, cg);
-  const auto pose_j = boost::target(relative_pose_edge, cg);
+  // Retrieve the camera poses from the relative pose edge.
+  const auto pose_u = boost::source(pose_edge, pg);
+  const auto pose_v = boost::target(pose_edge, pg);
   // The relative pose edge contains the set of all feature correspondences.
-  const auto& matches = cg[relative_pose_edge].matches;
+  const auto& matches = pg[pose_edge].matches;
   // Which of these feature correspondences are marked as inliers?
-  const auto& inliers = cg[relative_pose_edge].inliers;
+  const auto& inliers = pg[pose_edge].inliers;
 
-  // Loop over the feature correspondence and add the feature graph edges.
-  SARA_LOGD(logger, "Pose {} <-> Pose {}", pose_i, pose_j);
+  // Add the feature graph edges.
+  //
+  // They are feature matches that are deemed inliers according to the relative
+  // pose estimation task.
+  SARA_LOGD(logger, "Pose {} <-> Pose {}", pose_u, pose_v);
   SARA_LOGD(logger, "Add feature correspondences...");
   for (auto m = 0u; m < matches.size(); ++m)
   {
     if (!inliers(m))
       continue;
 
+    // The feature match is 'm = (ix, iy)',
+    // where 'ix' and 'iy' are the local IDs of features 'x' and 'y'.
     const auto& match = matches[m];
 
-    // Local feature indices.
-    const auto& f1 = match.x_index();
-    const auto& f2 = match.y_index();
-
-    // Create their corresponding feature GIDs.
-    const auto gid1 = FeatureGID{
-        .pose_vertex = pose_i,  //
-        .feature_index = f1     //
+    // 'x' and 'y' are respectively identified by their GID 'gid_x' and 'gid_y',
+    // which are defined as follows.
+    const auto gid_x = FeatureGID{
+        .pose_vertex = pose_u,            //
+        .feature_index = match.x_index()  //
     };
-    const auto gid2 = FeatureGID{
-        .pose_vertex = pose_j,  //
-        .feature_index = f2     //
+    const auto gid_y = FeatureGID{
+        .pose_vertex = pose_v,            //
+        .feature_index = match.y_index()  //
     };
 
-    // Locate their corresponding pair of vertices (u, v) in the graph?
-    // Do they exist yet in the first place?
-    const auto u_it = _feature_vertex.find(gid1);
-    const auto v_it = _feature_vertex.find(gid2);
+    // Have features 'x' and 'y' already been added to the graph, i.e.,
+    // do the vertices for 'gid_x' and 'gid_y' already exist in the graph?
+    const auto it_x = _feature_vertex.find(gid_x);
+    const auto it_y = _feature_vertex.find(gid_y);
 
-    const auto u_does_not_exist_yet = u_it == _feature_vertex.end();
-    const auto v_does_not_exist_yet = v_it == _feature_vertex.end();
+    const auto x_does_not_exist_yet = it_x == _feature_vertex.end();
+    const auto y_does_not_exist_yet = it_y == _feature_vertex.end();
 
     // If not, add them if necessary.
-    const auto u = u_does_not_exist_yet ? boost::add_vertex(fg) : u_it->second;
-    const auto v = v_does_not_exist_yet ? boost::add_vertex(fg) : v_it->second;
+    const auto x = x_does_not_exist_yet ? boost::add_vertex(fg) : it_x->second;
+    const auto y = y_does_not_exist_yet ? boost::add_vertex(fg) : it_y->second;
 
-    if (u_does_not_exist_yet)
+    if (x_does_not_exist_yet)
     {
-      fg[u] = gid1;
-      _feature_vertex[gid1] = u;
+      fg[x] = gid_x;
+      _feature_vertex[gid_x] = x;
     }
-    if (v_does_not_exist_yet)
+    if (y_does_not_exist_yet)
     {
-      fg[v] = gid2;
-      _feature_vertex[gid2] = v;
+      fg[y] = gid_y;
+      _feature_vertex[gid_y] = y;
     }
 
-    // Finally, store the feature match as an edge in the feature graph.
-    const auto [uv, uv_added] = boost::add_edge(u, v, fg);
-    auto& uv_attrs = fg[uv];
-    uv_attrs.i = boost::source(relative_pose_edge, cg);
-    uv_attrs.j = boost::target(relative_pose_edge, cg);
-    uv_attrs.index = m;
+    // Finally, store the feature match as an edge in the feature graph so that
+    // we can navigate from the feature graph back to the pose graph.
+    const auto [xy, xy_added] = boost::add_edge(x, y, fg);
+    auto& xy_attrs = fg[xy];
+    xy_attrs.pose_src = boost::source(pose_edge, pg);
+    xy_attrs.pose_dst = boost::target(pose_edge, pg);
+    xy_attrs.index = m;
   }
 
   // Update the feature disjoint-sets
diff --git a/cpp/src/DO/Sara/SfM/Graph/FeatureTracker.hpp b/cpp/src/DO/Sara/SfM/Graph/FeatureTracker.hpp
index 45323fb72..9ad4da9be 100644
--- a/cpp/src/DO/Sara/SfM/Graph/FeatureTracker.hpp
+++ b/cpp/src/DO/Sara/SfM/Graph/FeatureTracker.hpp
@@ -41,12 +41,11 @@ namespace DO::Sara {
     //! match index).
     std::map<MatchGID, FeatureGraph::Edge> _feature_match;
 
-    auto update_feature_tracks(  //
-        const CameraPoseGraph& camera_pose_graph,
-        const CameraPoseGraph::Edge relative_pose_edge_id) -> void;
+    auto update_feature_tracks(const CameraPoseGraph&,
+                               const CameraPoseGraph::Edge) -> void;
 
     auto calculate_alive_feature_tracks(
-        const CameraPoseGraph::Vertex camera_vertex_curr) const
+        const CameraPoseGraph::Vertex last_pose_vertex) const
         -> std::tuple<TrackArray, TrackVisibilityCountArray>;
   };
 
diff --git a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
index 92006bce8..b62cbd2db 100644
--- a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
+++ b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
@@ -120,6 +120,7 @@ auto v2::OdometryPipeline::add_camera_pose() -> bool
   const auto frame_number = _video_streamer.frame_number();
   auto keys_curr = detect_keypoints(frame);
 
+  // Boundary case.
   if (_pose_graph.num_vertices() == 1)
   {
     // Initialize the new camera pose from the latest image frame.
@@ -133,52 +134,64 @@ auto v2::OdometryPipeline::add_camera_pose() -> bool
 
     return true;
   }
-  else
+
+  const auto& keys_prev = _pose_graph[_pose_prev].keypoints;
+  auto [rel_pose_data, two_view_geometry] =
+      estimate_relative_pose(keys_prev, keys_curr);
+  const auto num_inliers = rel_pose_data.inliers.flat_array().count();
+  SARA_LOGI(logger, "[SfM] Relative pose inliers: {} 3D points", num_inliers);
+  if (num_inliers < _feature_params.num_inliers_min)
   {
-    const auto& keys_prev = _pose_graph[_pose_prev].keypoints;
-    auto [rel_pose_data, two_view_geometry] =
-        estimate_relative_pose(keys_prev, keys_curr);
-    const auto num_inliers = rel_pose_data.inliers.flat_array().count();
-    SARA_LOGI(logger, "[SfM] Relative pose inliers: {} 3D points", num_inliers);
-    if (num_inliers < _feature_params.num_inliers_min)
-    {
-      SARA_LOGI(logger, "[SfM] Relative pose failed!");
-      return false;
-    }
-    SARA_LOGI(logger, "[SfM] Relative pose succeeded!");
-
-    if (_pose_graph.num_vertices() == 2)
-    {
-      auto abs_pose_curr = QuaternionBasedPose<double>{
-          .q = Eigen::Quaterniond{rel_pose_data.motion.R},
-          .t = rel_pose_data.motion.t  //
-      };
-
-      auto abs_pose_data = AbsolutePoseData{
-          frame_number,             //
-          std::move(keys_curr),     //
-          std::move(abs_pose_curr)  //
-      };
-
-      // 1. Add the absolute pose vertex.
-      _pose_graph.add_absolute_pose(std::move(abs_pose_data));
-
-      // 2. Add the pose edge, which will invalidate the relative pose data.
-      _pose_graph.add_relative_pose(_pose_prev, _pose_curr,
-                                    std::move(rel_pose_data));
-
-      // 3. TODO: Init point cloud
-
-      return true;
-    }
-    else
-    {
-      // 1. Add the absolute pose vertex.
-
-      // TODO: Grow point cloud by triangulation.
-      return false;
-    }
+    SARA_LOGI(logger, "[SfM] Relative pose failed!");
+    return false;
+  }
+  SARA_LOGI(logger, "[SfM] Relative pose succeeded!");
+
+  if (_pose_graph.num_vertices() == 2)
+  {
+    auto abs_pose_curr = QuaternionBasedPose<double>{
+        .q = Eigen::Quaterniond{rel_pose_data.motion.R},
+        .t = rel_pose_data.motion.t  //
+    };
+
+    auto abs_pose_data = AbsolutePoseData{
+        frame_number,             //
+        std::move(keys_curr),     //
+        std::move(abs_pose_curr)  //
+    };
+
+    // 1. Add the absolute pose vertex.
+    _pose_graph.add_absolute_pose(std::move(abs_pose_data));
+
+    // 2. Add the pose edge, which will invalidate the relative pose data.
+    const auto pose_edge = _pose_graph.add_relative_pose(
+        _pose_prev, _pose_curr, std::move(rel_pose_data));
+
+    // 3. Grow the feature graph by adding the feature matches.
+    _feature_tracker.update_feature_tracks(_pose_graph, pose_edge);
+
+    // 4. TODO: Init point cloud
+
+    // 5. TODO: don't add 3D scene points that are too far away, like points in
+    //    the sky.
+
+    return true;
   }
 
+  // 1. Grow the feature graph first by adding the feature matches that are
+  //    deemed reliable from the relative pose estimation.
+  // 2. Recalculate the feature tracks.
+  // 3. Get the feature tracks that are still alive.
+  // 4. For each feature track still alive, get the corresponding scene
+  //    points.
+  //    Each alive feature track still has the same old feature IDs in the
+  //    previous image frames, and we know their scene points.
+  //    Use triangulation computer vision task, to calculate the new camera
+  //    absolute pose.
+  // 5. With the camera absolute pose, add the new scene points.
+  //    Specifically, they are the alive feature tracks (with cardinality 2)
+  //    for which we don't know the scene points yet.
+
+  // TODO: Grow point cloud by triangulation.
   return false;
 }

From 3a5fb2078ebcbd79ccfe3fef214ae9f8acd29f2c Mon Sep 17 00:00:00 2001
From: Odd Kiva <2375733-oddkiva@users.noreply.gitlab.com>
Date: Mon, 15 Apr 2024 01:03:31 +0100
Subject: [PATCH 33/49] WIP: save work.

---
 .../SfM/BuildingBlocks/BundleAdjuster.hpp     |  11 +
 .../BuildingBlocks/PointCloudManipulator.cpp  | 295 ++++++++++++++++++
 .../BuildingBlocks/PointCloudManipulator.hpp  |  71 ++++-
 .../SfM/BuildingBlocks/RgbColoredPoint.hpp    |  12 +-
 4 files changed, 372 insertions(+), 17 deletions(-)
 create mode 100644 cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudManipulator.cpp

diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/BundleAdjuster.hpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/BundleAdjuster.hpp
index 2a0cdba0f..feeeeb003 100644
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks/BundleAdjuster.hpp
+++ b/cpp/src/DO/Sara/SfM/BuildingBlocks/BundleAdjuster.hpp
@@ -1,3 +1,14 @@
+// ========================================================================== //
+// This file is part of Sara, a basic set of libraries in C++ for computer
+// vision.
+//
+// Copyright (C) 2024-present David Ok <david.ok8@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License v. 2.0. If a copy of the MPL was not distributed with this file,
+// you can obtain one at http://mozilla.org/MPL/2.0/.
+// ========================================================================== //
+
 #pragma once
 
 #include <DO/Sara/SfM/Graph/CameraPoseGraph.hpp>
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudManipulator.cpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudManipulator.cpp
new file mode 100644
index 000000000..c03a59629
--- /dev/null
+++ b/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudManipulator.cpp
@@ -0,0 +1,295 @@
+// ========================================================================== //
+// This file is part of Sara, a basic set of libraries in C++ for computer
+// vision.
+//
+// Copyright (C) 2024-present David Ok <david.ok8@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License v. 2.0. If a copy of the MPL was not distributed with this file,
+// you can obtain one at http://mozilla.org/MPL/2.0/.
+// ========================================================================== //
+
+#include "DO/Sara/Core/Pixel/SmartColorConversion.hpp"
+#include <DO/Sara/SfM/BuildingBlocks/PointCloudManipulator.hpp>
+
+#include <DO/Sara/Logging/Logger.hpp>
+
+#include <unordered_set>
+
+
+using namespace DO::Sara;
+
+
+auto PointCloudManipulator::list_scene_point_indices(
+    const FeatureTrack& track) const -> std::vector<ScenePointIndex>
+{
+  auto index_set = std::unordered_set<ScenePointIndex>{};
+  for (const auto& v : track)
+  {
+    const auto scene_point_it = _from_vertex_to_scene_point_index.find(v);
+    if (scene_point_it != _from_vertex_to_scene_point_index.end())
+      index_set.emplace(scene_point_it->second);
+  }
+
+  const auto index_list = std::vector<ScenePointIndex>(  //
+      index_set.begin(), index_set.end());
+
+  return index_list;
+}
+
+auto PointCloudManipulator::filter_by_non_max_suppression(
+    const FeatureTrack& track) const -> FeatureTrack
+{
+  struct VertexScorePair
+  {
+    FeatureVertex vertex;
+    float score;
+    auto operator<(const VertexScorePair& other) const -> bool
+    {
+      return score < other.score;
+    }
+  };
+
+  auto filtered_set = std::unordered_map<PoseVertex, VertexScorePair>{};
+  for (const auto& v : track)
+  {
+    const auto& f = feature(v);
+    const auto& pose_vertex = _feature_graph[v].pose_vertex;
+    const auto pose_vertex_it = filtered_set.find(pose_vertex);
+    if (pose_vertex_it == filtered_set.end())
+    {
+      filtered_set[pose_vertex] = {.vertex = v, .score = f.extremum_value};
+      continue;
+    }
+
+    auto& vertex_score = pose_vertex_it->second;
+    if (vertex_score.score < f.extremum_value)
+      vertex_score = {.vertex = v, .score = f.extremum_value};
+  }
+
+  auto filtered_list = FeatureTrack(filtered_set.size());
+  std::transform(filtered_set.begin(), filtered_set.end(),
+                 filtered_list.begin(),
+                 [](const auto& v) { return v.second.vertex; });
+
+  // Order feature vertices in a chronological order.
+  //
+  // The camera vertex ID is incremented as time goes on and can be seen as a
+  // timestep.
+  std::sort(filtered_list.begin(), filtered_list.end(),
+            [this](const auto u, const auto v) {
+              return _feature_graph[u].pose_vertex <
+                     _feature_graph[v].pose_vertex;
+            });
+
+  return filtered_list;
+}
+
+auto PointCloudManipulator::find_feature_vertex_at_pose(
+    const FeatureTrack& track,
+    const PoseVertex pose_vertex) const -> std::optional<FeatureVertex>
+{
+  auto v = std::find_if(track.begin(), track.end(),
+                        [this, pose_vertex](const auto& v) {
+                          return this->gid(v).pose_vertex == pose_vertex;
+                        });
+  return v == track.end() ? std::nullopt : std::make_optional(*v);
+}
+
+
+auto PointCloudManipulator::barycenter(
+    const std::vector<ScenePointIndex>& scene_point_indices) const -> ScenePoint
+{
+  if (scene_point_indices.empty())
+    throw std::runtime_error{"Error: cannot calculate a barycentric scene "
+                             "point from an empty list of scene point indices"};
+  static const ScenePoint::Value zero = ScenePoint::Value::Zero();
+  auto bary = std::accumulate(  //
+      scene_point_indices.begin(), scene_point_indices.end(), zero,
+      [this](const ScenePoint::Value& a,
+             const ScenePointIndex bi) -> ScenePoint::Value {
+        const ScenePoint::Value& b = _point_cloud[bi];
+        return a + b;
+      });
+  bary /= scene_point_indices.size();
+
+  auto scene_point = ScenePoint{};
+  static_cast<ScenePoint::Value&>(scene_point) = bary;
+  return scene_point;
+}
+
+auto PointCloudManipulator::split_by_scene_point_knowledge(
+    const std::vector<FeatureTrack>& tracks) const
+    -> std::pair<std::vector<FeatureTrack>, std::vector<FeatureTrack>>
+{
+  auto& logger = Logger::get();
+
+  auto tracks_with_known_scene_point = std::vector<FeatureTrack>{};
+  auto tracks_with_unknown_scene_point = std::vector<FeatureTrack>{};
+  tracks_with_known_scene_point.reserve(tracks.size());
+  tracks_with_unknown_scene_point.reserve(tracks.size());
+
+  SARA_LOGD(logger, "Splitting feature tracks by knowledge of scene point...");
+
+  for (const auto& track : tracks)
+  {
+    const auto scene_point_indices = list_scene_point_indices(track);
+    if (scene_point_indices.empty())
+      tracks_with_unknown_scene_point.emplace_back(track);
+    else
+      tracks_with_known_scene_point.emplace_back(track);
+  }
+
+  SARA_LOGD(logger, "Tracks: {}", tracks.size());
+  SARA_LOGD(logger, "Tracks with known   scene point: {}",
+            tracks_with_known_scene_point.size());
+  SARA_LOGD(logger, "Tracks with unknown scene point: {}",
+            tracks_with_unknown_scene_point.size());
+
+  return std::make_pair(tracks_with_known_scene_point,
+                        tracks_with_unknown_scene_point);
+}
+
+auto PointCloudManipulator::retrieve_scene_point_color(
+    const Eigen::Vector3d& scene_point,  //
+    const ImageView<Rgb8>& image,        //
+    const QuaternionBasedPose<double>& pose,
+    const v2::BrownConradyDistortionModel<double>& camera) const -> Rgb64f
+{
+  const auto& w = image.width();
+  const auto& h = image.height();
+
+  // Its coordinates in the camera frame.
+  const auto camera_point = pose * scene_point;
+
+  // Its corresponding pixel coordinates in the image.
+  const Eigen::Vector2i u = camera
+                                .project(camera_point)  //
+                                .array()
+                                .round()
+                                .cast<int>();
+
+  // Clamp for safety
+  // TODO: do bilinear interpolation.
+  const auto x = std::clamp(u.x(), 0, w - 1);
+  const auto y = std::clamp(u.y(), 0, h - 1);
+
+  // N.B.: the image is an array of BGR values.
+  const auto& rgb8 = image(x, y);
+  // We store RGB values.
+  static constexpr auto normalization_factor = 1 / 255.;
+  const Rgb64f rgb64f = rgb8.cast<double>() * normalization_factor;
+
+  return rgb64f;
+}
+
+
+auto PointCloudManipulator::init_point_cloud(
+    const std::vector<FeatureTrack>& feature_tracks,
+    const ImageView<Rgb8>& image,  //
+    const PoseEdge pose_edge,
+    const v2::BrownConradyDistortionModel<double>& camera) -> void
+{
+  auto& logger = Logger::get();
+
+  SARA_LOGD(logger, "Transform feature tracks into best feature pairs...");
+  const auto& pose_u = boost::source(pose_edge, _pose_graph);
+  const auto& pose_v = boost::target(pose_edge, _pose_graph);
+  const auto& pose_data_u = _camera_pose_graph[pose_u].pose;
+  const auto& pose_data_v = _camera_pose_graph[pose_v].pose;
+  SARA_LOGD(logger, "Pose[from]:\n{}", pose_from.matrix34());
+  SARA_LOGD(logger, "Pose[to  ]:\n{}", pose_to.matrix34());
+
+#if 0
+  const auto num_feature_tracks =
+      static_cast<Eigen::Index>(feature_tracks.size());
+
+  using FeatureVertexPair = std::array<FeatureVertex, 2>;
+  auto best_feature_pairs = std::vector<FeatureVertexPair>(num_feature_tracks);
+  std::transform(
+      feature_tracks.begin(), feature_tracks.end(), best_feature_pairs.begin(),
+      [this, camera_from, camera_to](const auto& track) -> FeatureVertexPair {
+        // Non-max suppression.
+        const auto track_filtered = filter_by_non_max_suppression(track);
+        if (track_filtered.size() != 2)
+          throw std::runtime_error{"Error: the feature track filtered by NMS "
+                                   "is not a feature pair!"};
+
+        // Retrieve the cleaned up feature correspondence.
+        const auto v_from = find_vertex_at_camera_view(track_filtered,  //
+                                                       camera_from);
+        const auto v_to = find_vertex_at_camera_view(track_filtered,  //
+                                                     camera_to);
+        if (!v_from.has_value() || !v_to.has_value())
+          throw std::runtime_error{
+              "Error: the feature pair is not a valid feature correspondence!"};
+
+        return {*v_from, *v_to};
+      });
+
+  SARA_LOGD(logger, "Calculating ray pairs from feature pairs...");
+  auto rays_from = Eigen::MatrixXd{3, num_feature_tracks};
+  auto rays_to = Eigen::MatrixXd{3, num_feature_tracks};
+  for (auto t = 0u; t < num_feature_tracks; ++t)
+  {
+    const auto& feature_pair = best_feature_pairs[t];
+    const auto coords = std::array{pixel_coords(feature_pair[0]),
+                                   pixel_coords(feature_pair[1])};
+    rays_from.col(t) = camera.backproject(coords[0]);
+    rays_to.col(t) = camera.backproject(coords[1]);
+  }
+
+  // Calculate the associated triangulation.
+  SARA_LOGD(logger, "Initialization the point cloud by 3D triangulation from "
+                    "the relative pose...");
+  const auto& relative_pose_attr = _pose_graph[pose_edge];
+  if (!relative_pose_attr.relative_pose.has_value())
+    throw std::runtime_error{
+        "Error: tried triangulating but there is no relative pose!"};
+  const auto& motion = *relative_pose_attr.relative_pose;
+  if ((pose_to.matrix34() - motion.projection_matrix()).norm() > 1e-6)
+    throw std::runtime_error{
+        "Error: the absolute pose is not initialized from the relative pose!"};
+
+  const auto triangulation = triangulate_linear_eigen(  //
+      pose_from.matrix34(), pose_to.matrix34(),         //
+      rays_from, rays_to);
+
+  // Allocate the mapping from the feature vertices to the scene point index.
+  if (!_from_vertex_to_scene_point_index.empty())
+    _from_vertex_to_scene_point_index.clear();
+
+  // Calculate the initial point cloud.
+  if (!_point_cloud.empty())
+    _point_cloud.clear();
+
+  const auto& X = triangulation.X;
+  const auto& scales_from = triangulation.scales[0];
+  const auto& scales_to = triangulation.scales[1];
+
+  auto scene_point_index = scene_point_index_t{};
+  for (auto j = 0; j < X.cols(); ++j)
+  {
+    // Only consider **cheiral** inliers!
+    if (!(scales_from(j) > 0 && scales_to(j) > 0))
+      continue;
+
+    const Eigen::Vector3d scene_coords = X.col(j).hnormalized();
+    const auto rgb = retrieve_scene_point_color(scene_coords, image,  //
+                                                pose_to, camera);
+    const auto colored_point = (ColoredPoint{} << scene_coords, rgb).finished();
+
+    // Add a new point to the point cloud.
+    _point_cloud.emplace_back(colored_point);
+
+    const auto& [u, v] = best_feature_pairs[j];
+
+    // Assign a scene point index.
+    _from_vertex_to_scene_point_index[u] = scene_point_index;
+    _from_vertex_to_scene_point_index[v] = scene_point_index;
+    ++scene_point_index;
+  }
+
+  SARA_LOGD(logger, "point cloud: {} 3D points", _point_cloud.size());
+#endif
+}
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudManipulator.hpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudManipulator.hpp
index 139dece09..d73575256 100644
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudManipulator.hpp
+++ b/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudManipulator.hpp
@@ -1,8 +1,19 @@
+// ========================================================================== //
+// This file is part of Sara, a basic set of libraries in C++ for computer
+// vision.
+//
+// Copyright (C) 2024-present David Ok <david.ok8@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License v. 2.0. If a copy of the MPL was not distributed with this file,
+// you can obtain one at http://mozilla.org/MPL/2.0/.
+// ========================================================================== //
+
 #pragma once
 
 #include <DO/Sara/SfM/BuildingBlocks/RgbColoredPoint.hpp>
 #include <DO/Sara/SfM/Graph/CameraPoseGraph.hpp>
-#include <DO/Sara/SfM/Graph/FeatureGraph.hpp>
+#include <DO/Sara/SfM/Graph/FeatureTracker.hpp>
 
 
 namespace DO::Sara {
@@ -10,32 +21,70 @@ namespace DO::Sara {
   class PointCloudManipulator
   {
   public:
-    using scene_point_index_t = std::size_t;
+    using PoseVertex = CameraPoseGraph::Vertex;
+    using PoseEdge = CameraPoseGraph::Edge;
+    using FeatureVertex = FeatureGraph::Vertex;
+    using ScenePointIndex = std::size_t;
+    using ScenePoint = RgbColoredPoint<double>;
+
     using PointCloud = std::vector<RgbColoredPoint<double>>;
+    using FeatureTrack = FeatureTracker::Track;
 
     PointCloudManipulator(const CameraPoseGraph& camera_pose_graph,
                           const FeatureGraph& feature_graph,
                           PointCloud& point_cloud)
-      : _camera_pose_graph{camera_pose_graph}
+      : _pose_graph{camera_pose_graph}
       , _feature_graph{feature_graph}
       , _point_cloud{point_cloud}
     {
     }
 
-    auto grow_point_cloud(const std::vector<FeatureGraph::Track>&,
-                          const CameraPoseGraph::Edge&,  //
-                          const ImageView<Rgb8>&) -> void;
+    auto list_scene_point_indices(const FeatureTrack&) const
+        -> std::vector<ScenePointIndex>;
+
+    auto filter_by_non_max_suppression(const FeatureTrack&) const  //
+        -> FeatureTrack;
+
+    auto split_by_scene_point_knowledge(const std::vector<FeatureTrack>&) const
+        -> std::pair<std::vector<FeatureTrack>, std::vector<FeatureTrack>>;
+
+    auto
+    init_point_cloud(const std::vector<FeatureTrack>&,  //
+                     const ImageView<Rgb8>&,            //
+                     const PoseEdge,
+                     const v2::BrownConradyDistortionModel<double>&) -> void;
+
+  public: /* utility methods */
+    auto gid(const FeatureVertex u) const -> const FeatureGID&
+    {
+      return _feature_graph[u];
+    }
+
+    auto feature(const FeatureVertex u) const -> const OERegion&
+    {
+      const auto& [pose_vertex, feature_index] = gid(u);
+      const auto& f = features(_pose_graph[pose_vertex].keypoints);
+      return f[feature_index];
+    }
+
+    auto barycenter(const std::vector<ScenePointIndex>&) const -> ScenePoint;
+
+    auto find_feature_vertex_at_pose(const FeatureTrack&,  //
+                                     const PoseVertex) const
+        -> std::optional<FeatureVertex>;
 
-    auto compact_point_cloud(const std::vector<FeatureGraph::Track>&,
-                             const CameraPoseGraph::Edge&,
-                             const ImageView<Rgb8>&) -> void;
+    auto retrieve_scene_point_color(
+        const Eigen::Vector3d& scene_point,  //
+        const ImageView<Rgb8>& image,        //
+        const QuaternionBasedPose<double>& pose,
+        const v2::BrownConradyDistortionModel<double>& camera) const -> Rgb64f;
 
   private:
-    const CameraPoseGraph& _camera_pose_graph;
+    const CameraPoseGraph& _pose_graph;
     const FeatureGraph& _feature_graph;
     PointCloud& _point_cloud;
 
-    std::unordered_map<FeatureGraph::Vertex, scene_point_index_t>
+    std::unordered_map<FeatureVertex, ScenePointIndex>
         _from_vertex_to_scene_point_index;
   };
 
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/RgbColoredPoint.hpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/RgbColoredPoint.hpp
index b393fe16f..8da8cecac 100644
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks/RgbColoredPoint.hpp
+++ b/cpp/src/DO/Sara/SfM/BuildingBlocks/RgbColoredPoint.hpp
@@ -12,32 +12,32 @@ namespace DO::Sara {
     using Coords = Eigen::Vector<T, 3>;
     using Color = Eigen::Vector<T, 3>;
 
-    operator Value&()
+    inline operator Value&()
     {
       return value;
     }
 
-    operator const Value&() const
+    inline operator const Value&() const
     {
       return value;
     }
 
-    auto coords() -> Eigen::Map<Coords>
+    inline auto coords() -> Eigen::Map<Coords>
     {
       return Eigen::Map<Eigen::Vector3<T>>{value.data()};
     }
 
-    auto coords() const -> Eigen::Map<const Coords>
+    inline auto coords() const -> Eigen::Map<const Coords>
     {
       return Eigen::Map<const Coords>{value.data()};
     }
 
-    auto color() -> Eigen::Map<Color>
+    inline auto color() -> Eigen::Map<Color>
     {
       return Eigen::Map<Color>{value.data() + 3};
     }
 
-    auto color() const -> Eigen::Map<const Color>
+    inline auto color() const -> Eigen::Map<const Color>
     {
       return Eigen::Map<const Color>{value.data() + 3};
     }

From f6e8fb8baf428c81b739b9da2d9fd01ebcc6cf56 Mon Sep 17 00:00:00 2001
From: Odd Kiva <2375733-oddkiva@users.noreply.gitlab.com>
Date: Mon, 15 Apr 2024 13:12:34 +0100
Subject: [PATCH 34/49] WIP: pursue work on point cloud generation.

---
 .../Geometry/PinholeCamera.hpp                |   6 +-
 .../BuildingBlocks/PointCloudGenerator.cpp    |  97 ++++++
 ...anipulator.hpp => PointCloudGenerator.hpp} |   8 +-
 .../BuildingBlocks/PointCloudManipulator.cpp  | 295 ------------------
 .../SfM/BuildingBlocks/RgbColoredPoint.hpp    |  24 +-
 cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp |  11 +
 6 files changed, 131 insertions(+), 310 deletions(-)
 create mode 100644 cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.cpp
 rename cpp/src/DO/Sara/SfM/BuildingBlocks/{PointCloudManipulator.hpp => PointCloudGenerator.hpp} (93%)
 delete mode 100644 cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudManipulator.cpp

diff --git a/cpp/src/DO/Sara/MultiViewGeometry/Geometry/PinholeCamera.hpp b/cpp/src/DO/Sara/MultiViewGeometry/Geometry/PinholeCamera.hpp
index 9ccc2b426..848013e01 100644
--- a/cpp/src/DO/Sara/MultiViewGeometry/Geometry/PinholeCamera.hpp
+++ b/cpp/src/DO/Sara/MultiViewGeometry/Geometry/PinholeCamera.hpp
@@ -37,9 +37,9 @@ namespace DO::Sara {
       return K * Rt;
     }
 
-    Eigen::Matrix3d K{Eigen::Matrix3d::Identity()};
-    Eigen::Matrix3d R{Eigen::Matrix3d::Identity()};
-    Eigen::Vector3d t{Eigen::Vector3d::Zero()};
+    Eigen::Matrix3d K = Eigen::Matrix3d::Identity();
+    Eigen::Matrix3d R = Eigen::Matrix3d::Identity();
+    Eigen::Vector3d t = Eigen::Vector3d::Zero();
   };
 
   inline auto normalized_camera(const Eigen::Matrix3d& R,
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.cpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.cpp
new file mode 100644
index 000000000..61d52ed06
--- /dev/null
+++ b/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.cpp
@@ -0,0 +1,97 @@
+// ========================================================================== //
+// This file is part of Sara, a basic set of libraries in C++ for computer
+// vision.
+//
+// Copyright (C) 2024-present David Ok <david.ok8@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License v. 2.0. If a copy of the MPL was not distributed with this file,
+// you can obtain one at http://mozilla.org/MPL/2.0/.
+// ========================================================================== //
+
+#pragma once
+
+#include <DO/Sara/SfM/BuildingBlocks/RgbColoredPoint.hpp>
+#include <DO/Sara/SfM/Graph/CameraPoseGraph.hpp>
+#include <DO/Sara/SfM/Graph/FeatureTracker.hpp>
+
+
+namespace DO::Sara {
+
+  class PointCloudGenerator
+  {
+  public:
+    using PoseVertex = CameraPoseGraph::Vertex;
+    using PoseEdge = CameraPoseGraph::Edge;
+    using FeatureVertex = FeatureGraph::Vertex;
+    using ScenePointIndex = std::size_t;
+    using ScenePoint = RgbColoredPoint<double>;
+
+    using PointCloud = std::vector<RgbColoredPoint<double>>;
+    using FeatureTrack = FeatureTracker::Track;
+
+    PointCloudGenerator(const CameraPoseGraph& camera_pose_graph,
+                        const FeatureGraph& feature_graph,
+                        PointCloud& point_cloud)
+      : _pose_graph{camera_pose_graph}
+      , _feature_graph{feature_graph}
+      , _point_cloud{point_cloud}
+    {
+    }
+
+    auto list_scene_point_indices(const FeatureTrack&) const
+        -> std::vector<ScenePointIndex>;
+
+    auto filter_by_non_max_suppression(const FeatureTrack&) const  //
+        -> FeatureTrack;
+
+    auto split_by_scene_point_knowledge(const std::vector<FeatureTrack>&) const
+        -> std::pair<std::vector<FeatureTrack>, std::vector<FeatureTrack>>;
+
+    auto seed_point_cloud_from_two_views(
+        const std::vector<FeatureTrack>&,  //
+        const ImageView<Rgb8>&,            //
+        const PoseEdge,                    //
+        const v2::BrownConradyDistortionModel<double>&) -> void;
+
+  public: /* utility methods */
+    auto gid(const FeatureVertex u) const -> const FeatureGID&
+    {
+      return _feature_graph[u];
+    }
+
+    auto feature(const FeatureVertex u) const -> const OERegion&
+    {
+      const auto& [pose_vertex, feature_index] = gid(u);
+      const auto& f = features(_pose_graph[pose_vertex].keypoints);
+      return f[feature_index];
+    }
+
+    auto pixel_coords(const FeatureVertex u) const -> const Eigen::Vector2f&
+    {
+      return feature(u).center();
+    }
+
+    auto barycenter(const std::vector<ScenePointIndex>&) const -> ScenePoint;
+
+    auto find_feature_vertex_at_pose(const FeatureTrack&,  //
+                                     const PoseVertex) const
+        -> std::optional<FeatureVertex>;
+
+    auto retrieve_scene_point_color(
+        const ScenePoint::Coords& scene_point_coords,      //
+        const ImageView<Rgb8>& image,                      //
+        const QuaternionBasedPose<double>& absolute_pose,  //
+        const v2::BrownConradyDistortionModel<double>& camera) const
+        -> ScenePoint::Color;
+
+  private:
+    const CameraPoseGraph& _pose_graph;
+    const FeatureGraph& _feature_graph;
+    PointCloud& _point_cloud;
+
+    std::unordered_map<FeatureVertex, ScenePointIndex>
+        _from_vertex_to_scene_point_index;
+  };
+
+}  // namespace DO::Sara
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudManipulator.hpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.hpp
similarity index 93%
rename from cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudManipulator.hpp
rename to cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.hpp
index d73575256..2ce69a063 100644
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudManipulator.hpp
+++ b/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.hpp
@@ -18,7 +18,7 @@
 
 namespace DO::Sara {
 
-  class PointCloudManipulator
+  class PointCloudGenerator
   {
   public:
     using PoseVertex = CameraPoseGraph::Vertex;
@@ -30,9 +30,9 @@ namespace DO::Sara {
     using PointCloud = std::vector<RgbColoredPoint<double>>;
     using FeatureTrack = FeatureTracker::Track;
 
-    PointCloudManipulator(const CameraPoseGraph& camera_pose_graph,
-                          const FeatureGraph& feature_graph,
-                          PointCloud& point_cloud)
+    PointCloudGenerator(const CameraPoseGraph& camera_pose_graph,
+                        const FeatureGraph& feature_graph,
+                        PointCloud& point_cloud)
       : _pose_graph{camera_pose_graph}
       , _feature_graph{feature_graph}
       , _point_cloud{point_cloud}
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudManipulator.cpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudManipulator.cpp
deleted file mode 100644
index c03a59629..000000000
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudManipulator.cpp
+++ /dev/null
@@ -1,295 +0,0 @@
-// ========================================================================== //
-// This file is part of Sara, a basic set of libraries in C++ for computer
-// vision.
-//
-// Copyright (C) 2024-present David Ok <david.ok8@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla Public
-// License v. 2.0. If a copy of the MPL was not distributed with this file,
-// you can obtain one at http://mozilla.org/MPL/2.0/.
-// ========================================================================== //
-
-#include "DO/Sara/Core/Pixel/SmartColorConversion.hpp"
-#include <DO/Sara/SfM/BuildingBlocks/PointCloudManipulator.hpp>
-
-#include <DO/Sara/Logging/Logger.hpp>
-
-#include <unordered_set>
-
-
-using namespace DO::Sara;
-
-
-auto PointCloudManipulator::list_scene_point_indices(
-    const FeatureTrack& track) const -> std::vector<ScenePointIndex>
-{
-  auto index_set = std::unordered_set<ScenePointIndex>{};
-  for (const auto& v : track)
-  {
-    const auto scene_point_it = _from_vertex_to_scene_point_index.find(v);
-    if (scene_point_it != _from_vertex_to_scene_point_index.end())
-      index_set.emplace(scene_point_it->second);
-  }
-
-  const auto index_list = std::vector<ScenePointIndex>(  //
-      index_set.begin(), index_set.end());
-
-  return index_list;
-}
-
-auto PointCloudManipulator::filter_by_non_max_suppression(
-    const FeatureTrack& track) const -> FeatureTrack
-{
-  struct VertexScorePair
-  {
-    FeatureVertex vertex;
-    float score;
-    auto operator<(const VertexScorePair& other) const -> bool
-    {
-      return score < other.score;
-    }
-  };
-
-  auto filtered_set = std::unordered_map<PoseVertex, VertexScorePair>{};
-  for (const auto& v : track)
-  {
-    const auto& f = feature(v);
-    const auto& pose_vertex = _feature_graph[v].pose_vertex;
-    const auto pose_vertex_it = filtered_set.find(pose_vertex);
-    if (pose_vertex_it == filtered_set.end())
-    {
-      filtered_set[pose_vertex] = {.vertex = v, .score = f.extremum_value};
-      continue;
-    }
-
-    auto& vertex_score = pose_vertex_it->second;
-    if (vertex_score.score < f.extremum_value)
-      vertex_score = {.vertex = v, .score = f.extremum_value};
-  }
-
-  auto filtered_list = FeatureTrack(filtered_set.size());
-  std::transform(filtered_set.begin(), filtered_set.end(),
-                 filtered_list.begin(),
-                 [](const auto& v) { return v.second.vertex; });
-
-  // Order feature vertices in a chronological order.
-  //
-  // The camera vertex ID is incremented as time goes on and can be seen as a
-  // timestep.
-  std::sort(filtered_list.begin(), filtered_list.end(),
-            [this](const auto u, const auto v) {
-              return _feature_graph[u].pose_vertex <
-                     _feature_graph[v].pose_vertex;
-            });
-
-  return filtered_list;
-}
-
-auto PointCloudManipulator::find_feature_vertex_at_pose(
-    const FeatureTrack& track,
-    const PoseVertex pose_vertex) const -> std::optional<FeatureVertex>
-{
-  auto v = std::find_if(track.begin(), track.end(),
-                        [this, pose_vertex](const auto& v) {
-                          return this->gid(v).pose_vertex == pose_vertex;
-                        });
-  return v == track.end() ? std::nullopt : std::make_optional(*v);
-}
-
-
-auto PointCloudManipulator::barycenter(
-    const std::vector<ScenePointIndex>& scene_point_indices) const -> ScenePoint
-{
-  if (scene_point_indices.empty())
-    throw std::runtime_error{"Error: cannot calculate a barycentric scene "
-                             "point from an empty list of scene point indices"};
-  static const ScenePoint::Value zero = ScenePoint::Value::Zero();
-  auto bary = std::accumulate(  //
-      scene_point_indices.begin(), scene_point_indices.end(), zero,
-      [this](const ScenePoint::Value& a,
-             const ScenePointIndex bi) -> ScenePoint::Value {
-        const ScenePoint::Value& b = _point_cloud[bi];
-        return a + b;
-      });
-  bary /= scene_point_indices.size();
-
-  auto scene_point = ScenePoint{};
-  static_cast<ScenePoint::Value&>(scene_point) = bary;
-  return scene_point;
-}
-
-auto PointCloudManipulator::split_by_scene_point_knowledge(
-    const std::vector<FeatureTrack>& tracks) const
-    -> std::pair<std::vector<FeatureTrack>, std::vector<FeatureTrack>>
-{
-  auto& logger = Logger::get();
-
-  auto tracks_with_known_scene_point = std::vector<FeatureTrack>{};
-  auto tracks_with_unknown_scene_point = std::vector<FeatureTrack>{};
-  tracks_with_known_scene_point.reserve(tracks.size());
-  tracks_with_unknown_scene_point.reserve(tracks.size());
-
-  SARA_LOGD(logger, "Splitting feature tracks by knowledge of scene point...");
-
-  for (const auto& track : tracks)
-  {
-    const auto scene_point_indices = list_scene_point_indices(track);
-    if (scene_point_indices.empty())
-      tracks_with_unknown_scene_point.emplace_back(track);
-    else
-      tracks_with_known_scene_point.emplace_back(track);
-  }
-
-  SARA_LOGD(logger, "Tracks: {}", tracks.size());
-  SARA_LOGD(logger, "Tracks with known   scene point: {}",
-            tracks_with_known_scene_point.size());
-  SARA_LOGD(logger, "Tracks with unknown scene point: {}",
-            tracks_with_unknown_scene_point.size());
-
-  return std::make_pair(tracks_with_known_scene_point,
-                        tracks_with_unknown_scene_point);
-}
-
-auto PointCloudManipulator::retrieve_scene_point_color(
-    const Eigen::Vector3d& scene_point,  //
-    const ImageView<Rgb8>& image,        //
-    const QuaternionBasedPose<double>& pose,
-    const v2::BrownConradyDistortionModel<double>& camera) const -> Rgb64f
-{
-  const auto& w = image.width();
-  const auto& h = image.height();
-
-  // Its coordinates in the camera frame.
-  const auto camera_point = pose * scene_point;
-
-  // Its corresponding pixel coordinates in the image.
-  const Eigen::Vector2i u = camera
-                                .project(camera_point)  //
-                                .array()
-                                .round()
-                                .cast<int>();
-
-  // Clamp for safety
-  // TODO: do bilinear interpolation.
-  const auto x = std::clamp(u.x(), 0, w - 1);
-  const auto y = std::clamp(u.y(), 0, h - 1);
-
-  // N.B.: the image is an array of BGR values.
-  const auto& rgb8 = image(x, y);
-  // We store RGB values.
-  static constexpr auto normalization_factor = 1 / 255.;
-  const Rgb64f rgb64f = rgb8.cast<double>() * normalization_factor;
-
-  return rgb64f;
-}
-
-
-auto PointCloudManipulator::init_point_cloud(
-    const std::vector<FeatureTrack>& feature_tracks,
-    const ImageView<Rgb8>& image,  //
-    const PoseEdge pose_edge,
-    const v2::BrownConradyDistortionModel<double>& camera) -> void
-{
-  auto& logger = Logger::get();
-
-  SARA_LOGD(logger, "Transform feature tracks into best feature pairs...");
-  const auto& pose_u = boost::source(pose_edge, _pose_graph);
-  const auto& pose_v = boost::target(pose_edge, _pose_graph);
-  const auto& pose_data_u = _camera_pose_graph[pose_u].pose;
-  const auto& pose_data_v = _camera_pose_graph[pose_v].pose;
-  SARA_LOGD(logger, "Pose[from]:\n{}", pose_from.matrix34());
-  SARA_LOGD(logger, "Pose[to  ]:\n{}", pose_to.matrix34());
-
-#if 0
-  const auto num_feature_tracks =
-      static_cast<Eigen::Index>(feature_tracks.size());
-
-  using FeatureVertexPair = std::array<FeatureVertex, 2>;
-  auto best_feature_pairs = std::vector<FeatureVertexPair>(num_feature_tracks);
-  std::transform(
-      feature_tracks.begin(), feature_tracks.end(), best_feature_pairs.begin(),
-      [this, camera_from, camera_to](const auto& track) -> FeatureVertexPair {
-        // Non-max suppression.
-        const auto track_filtered = filter_by_non_max_suppression(track);
-        if (track_filtered.size() != 2)
-          throw std::runtime_error{"Error: the feature track filtered by NMS "
-                                   "is not a feature pair!"};
-
-        // Retrieve the cleaned up feature correspondence.
-        const auto v_from = find_vertex_at_camera_view(track_filtered,  //
-                                                       camera_from);
-        const auto v_to = find_vertex_at_camera_view(track_filtered,  //
-                                                     camera_to);
-        if (!v_from.has_value() || !v_to.has_value())
-          throw std::runtime_error{
-              "Error: the feature pair is not a valid feature correspondence!"};
-
-        return {*v_from, *v_to};
-      });
-
-  SARA_LOGD(logger, "Calculating ray pairs from feature pairs...");
-  auto rays_from = Eigen::MatrixXd{3, num_feature_tracks};
-  auto rays_to = Eigen::MatrixXd{3, num_feature_tracks};
-  for (auto t = 0u; t < num_feature_tracks; ++t)
-  {
-    const auto& feature_pair = best_feature_pairs[t];
-    const auto coords = std::array{pixel_coords(feature_pair[0]),
-                                   pixel_coords(feature_pair[1])};
-    rays_from.col(t) = camera.backproject(coords[0]);
-    rays_to.col(t) = camera.backproject(coords[1]);
-  }
-
-  // Calculate the associated triangulation.
-  SARA_LOGD(logger, "Initialization the point cloud by 3D triangulation from "
-                    "the relative pose...");
-  const auto& relative_pose_attr = _pose_graph[pose_edge];
-  if (!relative_pose_attr.relative_pose.has_value())
-    throw std::runtime_error{
-        "Error: tried triangulating but there is no relative pose!"};
-  const auto& motion = *relative_pose_attr.relative_pose;
-  if ((pose_to.matrix34() - motion.projection_matrix()).norm() > 1e-6)
-    throw std::runtime_error{
-        "Error: the absolute pose is not initialized from the relative pose!"};
-
-  const auto triangulation = triangulate_linear_eigen(  //
-      pose_from.matrix34(), pose_to.matrix34(),         //
-      rays_from, rays_to);
-
-  // Allocate the mapping from the feature vertices to the scene point index.
-  if (!_from_vertex_to_scene_point_index.empty())
-    _from_vertex_to_scene_point_index.clear();
-
-  // Calculate the initial point cloud.
-  if (!_point_cloud.empty())
-    _point_cloud.clear();
-
-  const auto& X = triangulation.X;
-  const auto& scales_from = triangulation.scales[0];
-  const auto& scales_to = triangulation.scales[1];
-
-  auto scene_point_index = scene_point_index_t{};
-  for (auto j = 0; j < X.cols(); ++j)
-  {
-    // Only consider **cheiral** inliers!
-    if (!(scales_from(j) > 0 && scales_to(j) > 0))
-      continue;
-
-    const Eigen::Vector3d scene_coords = X.col(j).hnormalized();
-    const auto rgb = retrieve_scene_point_color(scene_coords, image,  //
-                                                pose_to, camera);
-    const auto colored_point = (ColoredPoint{} << scene_coords, rgb).finished();
-
-    // Add a new point to the point cloud.
-    _point_cloud.emplace_back(colored_point);
-
-    const auto& [u, v] = best_feature_pairs[j];
-
-    // Assign a scene point index.
-    _from_vertex_to_scene_point_index[u] = scene_point_index;
-    _from_vertex_to_scene_point_index[v] = scene_point_index;
-    ++scene_point_index;
-  }
-
-  SARA_LOGD(logger, "point cloud: {} 3D points", _point_cloud.size());
-#endif
-}
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/RgbColoredPoint.hpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/RgbColoredPoint.hpp
index 8da8cecac..8e94875ab 100644
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks/RgbColoredPoint.hpp
+++ b/cpp/src/DO/Sara/SfM/BuildingBlocks/RgbColoredPoint.hpp
@@ -6,44 +6,52 @@
 namespace DO::Sara {
 
   template <typename T>
-  struct RgbColoredPoint
+  class RgbColoredPoint
   {
+  public:
     using Value = Eigen::Vector<T, 6>;
     using Coords = Eigen::Vector<T, 3>;
     using Color = Eigen::Vector<T, 3>;
 
+    RgbColoredPoint() = default;
+
+    RgbColoredPoint(const Value& v)
+      : _v{v}
+    {
+    }
+
     inline operator Value&()
     {
-      return value;
+      return _v;
     }
 
     inline operator const Value&() const
     {
-      return value;
+      return _v;
     }
 
     inline auto coords() -> Eigen::Map<Coords>
     {
-      return Eigen::Map<Eigen::Vector3<T>>{value.data()};
+      return Eigen::Map<Eigen::Vector3<T>>{_v.data()};
     }
 
     inline auto coords() const -> Eigen::Map<const Coords>
     {
-      return Eigen::Map<const Coords>{value.data()};
+      return Eigen::Map<const Coords>{_v.data()};
     }
 
     inline auto color() -> Eigen::Map<Color>
     {
-      return Eigen::Map<Color>{value.data() + 3};
+      return Eigen::Map<Color>{_v.data() + 3};
     }
 
     inline auto color() const -> Eigen::Map<const Color>
     {
-      return Eigen::Map<const Color>{value.data() + 3};
+      return Eigen::Map<const Color>{_v.data() + 3};
     }
 
   private:
-    Value value;
+    Value _v;
   };
 
 }  // namespace DO::Sara
diff --git a/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp b/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp
index 5ba8584ce..982c5b0cb 100644
--- a/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp
+++ b/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.hpp
@@ -99,6 +99,17 @@ namespace DO::Sara {
       return _g[e];
     }
 
+    auto source(const Edge e) const -> Vertex
+    {
+      return boost::source(e, _g);
+    }
+
+    auto target(const Edge e) const -> Vertex
+    {
+      return boost::target(e, _g);
+    }
+
+
     auto edge(const Vertex u, const Vertex v) const -> std::pair<Edge, bool>
     {
       return boost::edge(u, v, _g);

From 9a8ad445f714295e762c42109e3634c2cd603e88 Mon Sep 17 00:00:00 2001
From: David OK <david.ok8@gmail.com>
Date: Mon, 15 Apr 2024 16:01:01 +0100
Subject: [PATCH 35/49] WIP: save work.

---
 .../visual_odometry_example_v2.cpp            |  2 +-
 .../BuildingBlocks/PointCloudGenerator.hpp    | 15 +++++------
 .../Sara/SfM/OdometryV2/OdometryPipeline.cpp  | 16 ++++++++----
 .../Sara/SfM/OdometryV2/OdometryPipeline.hpp  | 25 +++++++++++--------
 4 files changed, 35 insertions(+), 23 deletions(-)

diff --git a/cpp/examples/Sara/MultiViewGeometry/visual_odometry_example_v2.cpp b/cpp/examples/Sara/MultiViewGeometry/visual_odometry_example_v2.cpp
index 1c7f50309..4d4d24aa0 100644
--- a/cpp/examples/Sara/MultiViewGeometry/visual_odometry_example_v2.cpp
+++ b/cpp/examples/Sara/MultiViewGeometry/visual_odometry_example_v2.cpp
@@ -359,7 +359,7 @@ auto main(int const argc, char** const argv) -> int
     // clang-format off
     camera.k() <<
       -0.2338367557617234,
-      0.05952465745165465,
+      +0.05952465745165465,
       -0.007947847982157091;
     // clang-format on
     camera.p() << -0.0003137658969742134, 0.00021943576376532096;
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.hpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.hpp
index 2ce69a063..910006d0c 100644
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.hpp
+++ b/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.hpp
@@ -29,6 +29,8 @@ namespace DO::Sara {
 
     using PointCloud = std::vector<RgbColoredPoint<double>>;
     using FeatureTrack = FeatureTracker::Track;
+    using FeatureToScenePointMap = std::unordered_map<FeatureVertex,  //
+                                                      ScenePointIndex>;
 
     PointCloudGenerator(const CameraPoseGraph& camera_pose_graph,
                         const FeatureGraph& feature_graph,
@@ -48,11 +50,11 @@ namespace DO::Sara {
     auto split_by_scene_point_knowledge(const std::vector<FeatureTrack>&) const
         -> std::pair<std::vector<FeatureTrack>, std::vector<FeatureTrack>>;
 
-    auto
-    init_point_cloud(const std::vector<FeatureTrack>&,  //
-                     const ImageView<Rgb8>&,            //
-                     const PoseEdge,
-                     const v2::BrownConradyDistortionModel<double>&) -> void;
+    auto init_point_cloud(const std::vector<FeatureTrack>&,  //
+                          const ImageView<Rgb8>&,            //
+                          const PoseEdge,
+                          const v2::BrownConradyDistortionModel<double>&)
+        -> void;
 
   public: /* utility methods */
     auto gid(const FeatureVertex u) const -> const FeatureGID&
@@ -84,8 +86,7 @@ namespace DO::Sara {
     const FeatureGraph& _feature_graph;
     PointCloud& _point_cloud;
 
-    std::unordered_map<FeatureVertex, ScenePointIndex>
-        _from_vertex_to_scene_point_index;
+    FeatureToScenePointMap _from_vertex_to_scene_point_index;
   };
 
 }  // namespace DO::Sara
diff --git a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
index b62cbd2db..5cc32068a 100644
--- a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
+++ b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
@@ -38,6 +38,8 @@ auto v2::OdometryPipeline::set_config(
       _camera                           //
   );
   _relative_pose_estimator.configure(_camera);
+  _point_cloud_generator = std::make_unique<PointCloudGenerator>(
+      _pose_graph, _feature_tracker._feature_graph, _point_cloud);
 }
 
 auto v2::OdometryPipeline::read() -> bool
@@ -64,8 +66,10 @@ auto v2::OdometryPipeline::detect_keypoints(const ImageView<float>& image) const
     -> KeypointList<OERegion, float>
 {
   auto& logger = Logger::get();
-  SARA_LOGI(logger, "[Feature Detection] Matching image keypoints...");
-  return compute_sift_keypoints(image, _feature_params.image_pyr_params);
+  const auto keys = compute_sift_keypoints(image,  //
+                                           _feature_params.image_pyr_params);
+  SARA_LOGI(logger, "[Feature Detection] {} keypoints", features(keys).size());
+  return keys;
 }
 
 auto v2::OdometryPipeline::estimate_relative_pose(
@@ -120,8 +124,8 @@ auto v2::OdometryPipeline::add_camera_pose() -> bool
   const auto frame_number = _video_streamer.frame_number();
   auto keys_curr = detect_keypoints(frame);
 
-  // Boundary case.
-  if (_pose_graph.num_vertices() == 1)
+  // Boundary case: the graphs are empty.
+  if (_pose_graph.num_vertices() == 0)
   {
     // Initialize the new camera pose from the latest image frame.
     auto abs_pose_curr = QuaternionBasedPose<double>::identity();
@@ -147,7 +151,7 @@ auto v2::OdometryPipeline::add_camera_pose() -> bool
   }
   SARA_LOGI(logger, "[SfM] Relative pose succeeded!");
 
-  if (_pose_graph.num_vertices() == 2)
+  if (_pose_graph.num_vertices() == 1)
   {
     auto abs_pose_curr = QuaternionBasedPose<double>{
         .q = Eigen::Quaterniond{rel_pose_data.motion.R},
@@ -169,6 +173,8 @@ auto v2::OdometryPipeline::add_camera_pose() -> bool
 
     // 3. Grow the feature graph by adding the feature matches.
     _feature_tracker.update_feature_tracks(_pose_graph, pose_edge);
+    std::tie(_tracks_alive, _track_visibility_count) =
+        _feature_tracker.calculate_alive_feature_tracks(_pose_curr);
 
     // 4. TODO: Init point cloud
 
diff --git a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.hpp b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.hpp
index acead5370..90c6a2ab3 100644
--- a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.hpp
+++ b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.hpp
@@ -12,21 +12,21 @@
 #pragma once
 
 #include <DO/Sara/Features/KeypointList.hpp>
-#include <DO/Sara/SfM/Odometry/ImageDistortionCorrector.hpp>
-#include <DO/Sara/SfM/Odometry/VideoStreamer.hpp>
-
+#include <DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.hpp>
 #include <DO/Sara/SfM/BuildingBlocks/RelativePoseEstimator.hpp>
 #include <DO/Sara/SfM/Graph/CameraPoseGraph.hpp>
 #include <DO/Sara/SfM/Graph/FeatureTracker.hpp>
+#include <DO/Sara/SfM/Odometry/ImageDistortionCorrector.hpp>
+#include <DO/Sara/SfM/Odometry/VideoStreamer.hpp>
 
 namespace DO::Sara::v2 {
 
   class OdometryPipeline
   {
   public:
-    auto
-    set_config(const std::filesystem::path& video_path,
-               const v2::BrownConradyDistortionModel<double>& camera) -> void;
+    auto set_config(const std::filesystem::path& video_path,
+                    const v2::BrownConradyDistortionModel<double>& camera)
+        -> void;
 
     auto read() -> bool;
 
@@ -41,9 +41,10 @@ namespace DO::Sara::v2 {
     auto detect_keypoints(const ImageView<float>&) const
         -> KeypointList<OERegion, float>;
 
-    auto estimate_relative_pose(const KeypointList<OERegion, float>& keys_src,
-                                const KeypointList<OERegion, float>& keys_dst)
-        const -> std::pair<RelativePoseData, TwoViewGeometry>;
+    auto
+    estimate_relative_pose(const KeypointList<OERegion, float>& keys_src,
+                           const KeypointList<OERegion, float>& keys_dst) const
+        -> std::pair<RelativePoseData, TwoViewGeometry>;
 
   private: /* graph update tasks */
     auto add_camera_pose() -> bool;
@@ -52,15 +53,19 @@ namespace DO::Sara::v2 {
     VideoStreamer _video_streamer;
     v2::BrownConradyDistortionModel<double> _camera;
 
+    //! @brief Data mutators.
+    //! @{
     std::unique_ptr<ImageDistortionCorrector> _distortion_corrector;
     v2::RelativePoseEstimator _relative_pose_estimator;
-
+    std::unique_ptr<PointCloudGenerator> _point_cloud_generator;
+    //! @}
 
     //! @brief SfM data.
     //! @{
     FeatureParams _feature_params;
     FeatureTracker _feature_tracker;
     CameraPoseGraph _pose_graph;
+    PointCloudGenerator::PointCloud _point_cloud;
     //! @}
 
     //! @brief SfM state.

From ca520e5efd89840efb41a7920573ddb7451480db Mon Sep 17 00:00:00 2001
From: David OK <david.ok8@gmail.com>
Date: Mon, 15 Apr 2024 17:10:15 +0100
Subject: [PATCH 36/49] WIP: salvage lost work.

---
 .../BuildingBlocks/PointCloudGenerator.cpp    | 326 ++++++++++++++----
 .../BuildingBlocks/PointCloudGenerator.hpp    |   7 +-
 2 files changed, 266 insertions(+), 67 deletions(-)

diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.cpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.cpp
index 61d52ed06..836ebe003 100644
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.cpp
+++ b/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.cpp
@@ -9,89 +9,283 @@
 // you can obtain one at http://mozilla.org/MPL/2.0/.
 // ========================================================================== //
 
-#pragma once
+#include <DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.hpp>
 
-#include <DO/Sara/SfM/BuildingBlocks/RgbColoredPoint.hpp>
-#include <DO/Sara/SfM/Graph/CameraPoseGraph.hpp>
-#include <DO/Sara/SfM/Graph/FeatureTracker.hpp>
+#include <DO/Sara/Logging/Logger.hpp>
 
 
-namespace DO::Sara {
+using namespace DO::Sara;
 
-  class PointCloudGenerator
-  {
-  public:
-    using PoseVertex = CameraPoseGraph::Vertex;
-    using PoseEdge = CameraPoseGraph::Edge;
-    using FeatureVertex = FeatureGraph::Vertex;
-    using ScenePointIndex = std::size_t;
-    using ScenePoint = RgbColoredPoint<double>;
-
-    using PointCloud = std::vector<RgbColoredPoint<double>>;
-    using FeatureTrack = FeatureTracker::Track;
-
-    PointCloudGenerator(const CameraPoseGraph& camera_pose_graph,
-                        const FeatureGraph& feature_graph,
-                        PointCloud& point_cloud)
-      : _pose_graph{camera_pose_graph}
-      , _feature_graph{feature_graph}
-      , _point_cloud{point_cloud}
-    {
-    }
 
-    auto list_scene_point_indices(const FeatureTrack&) const
-        -> std::vector<ScenePointIndex>;
-
-    auto filter_by_non_max_suppression(const FeatureTrack&) const  //
-        -> FeatureTrack;
+auto PointCloudGenerator::list_scene_point_indices(
+    const FeatureTrack& track) const -> std::vector<ScenePointIndex>
+{
+  auto index_set = std::unordered_set<ScenePointIndex>{};
+  for (const auto& v : track)
+  {
+    const auto scene_point_it = _from_vertex_to_scene_point_index.find(v);
+    if (scene_point_it != _from_vertex_to_scene_point_index.end())
+      index_set.emplace(scene_point_it->second);
+  }
 
-    auto split_by_scene_point_knowledge(const std::vector<FeatureTrack>&) const
-        -> std::pair<std::vector<FeatureTrack>, std::vector<FeatureTrack>>;
+  const auto index_list = std::vector<ScenePointIndex>(  //
+      index_set.begin(), index_set.end());
 
-    auto seed_point_cloud_from_two_views(
-        const std::vector<FeatureTrack>&,  //
-        const ImageView<Rgb8>&,            //
-        const PoseEdge,                    //
-        const v2::BrownConradyDistortionModel<double>&) -> void;
+  return index_list;
+}
 
-  public: /* utility methods */
-    auto gid(const FeatureVertex u) const -> const FeatureGID&
+auto PointCloudGenerator::filter_by_non_max_suppression(
+    const FeatureTrack& track) const -> FeatureTrack
+{
+  struct VertexScorePair
+  {
+    FeatureVertex vertex;
+    float score;
+    auto operator<(const VertexScorePair& other) const -> bool
     {
-      return _feature_graph[u];
+      return score < other.score;
     }
+  };
 
-    auto feature(const FeatureVertex u) const -> const OERegion&
+  auto filtered_set = std::unordered_map<PoseVertex, VertexScorePair>{};
+  for (const auto& v : track)
+  {
+    const auto& f = feature(v);
+    const auto& pose_vertex = _feature_graph[v].pose_vertex;
+    const auto pose_vertex_it = filtered_set.find(pose_vertex);
+    if (pose_vertex_it == filtered_set.end())
     {
-      const auto& [pose_vertex, feature_index] = gid(u);
-      const auto& f = features(_pose_graph[pose_vertex].keypoints);
-      return f[feature_index];
+      filtered_set[pose_vertex] = {.vertex = v, .score = f.extremum_value};
+      continue;
     }
 
-    auto pixel_coords(const FeatureVertex u) const -> const Eigen::Vector2f&
-    {
-      return feature(u).center();
-    }
+    auto& vertex_score = pose_vertex_it->second;
+    if (vertex_score.score < f.extremum_value)
+      vertex_score = {.vertex = v, .score = f.extremum_value};
+  }
 
-    auto barycenter(const std::vector<ScenePointIndex>&) const -> ScenePoint;
+  auto filtered_list = FeatureTrack(filtered_set.size());
+  std::transform(filtered_set.begin(), filtered_set.end(),
+                 filtered_list.begin(),
+                 [](const auto& v) { return v.second.vertex; });
 
-    auto find_feature_vertex_at_pose(const FeatureTrack&,  //
-                                     const PoseVertex) const
-        -> std::optional<FeatureVertex>;
+  // Order feature vertices in a chronological order.
+  //
+  // The camera vertex ID is incremented as time goes on and can be seen as a
+  // timestep.
+  std::sort(filtered_list.begin(), filtered_list.end(),
+            [this](const auto u, const auto v) {
+              return _feature_graph[u].pose_vertex <
+                     _feature_graph[v].pose_vertex;
+            });
 
-    auto retrieve_scene_point_color(
-        const ScenePoint::Coords& scene_point_coords,      //
-        const ImageView<Rgb8>& image,                      //
-        const QuaternionBasedPose<double>& absolute_pose,  //
-        const v2::BrownConradyDistortionModel<double>& camera) const
-        -> ScenePoint::Color;
+  return filtered_list;
+}
 
-  private:
-    const CameraPoseGraph& _pose_graph;
-    const FeatureGraph& _feature_graph;
-    PointCloud& _point_cloud;
+auto PointCloudGenerator::find_feature_vertex_at_pose(
+    const FeatureTrack& track, const PoseVertex pose_vertex) const
+    -> std::optional<FeatureVertex>
+{
+  auto v = std::find_if(track.begin(), track.end(),
+                        [this, pose_vertex](const auto& v) {
+                          return this->gid(v).pose_vertex == pose_vertex;
+                        });
+  return v == track.end() ? std::nullopt : std::make_optional(*v);
+}
 
-    std::unordered_map<FeatureVertex, ScenePointIndex>
-        _from_vertex_to_scene_point_index;
-  };
 
-}  // namespace DO::Sara
+auto PointCloudGenerator::barycenter(
+    const std::vector<ScenePointIndex>& scene_point_indices) const -> ScenePoint
+{
+  if (scene_point_indices.empty())
+    throw std::runtime_error{"Error: cannot calculate a barycentric scene "
+                             "point from an empty list of scene point indices"};
+  static const ScenePoint::Value zero = ScenePoint::Value::Zero();
+  auto bary = std::accumulate(  //
+      scene_point_indices.begin(), scene_point_indices.end(), zero,
+      [this](const ScenePoint::Value& a,
+             const ScenePointIndex bi) -> ScenePoint::Value {
+        const ScenePoint::Value& b = _point_cloud[bi];
+        return a + b;
+      });
+  bary /= scene_point_indices.size();
+
+  return bary;
+}
+
+auto PointCloudGenerator::split_by_scene_point_knowledge(
+    const std::vector<FeatureTrack>& tracks) const
+    -> std::pair<std::vector<FeatureTrack>, std::vector<FeatureTrack>>
+{
+  auto& logger = Logger::get();
+
+  auto tracks_with_known_scene_point = std::vector<FeatureTrack>{};
+  auto tracks_with_unknown_scene_point = std::vector<FeatureTrack>{};
+  tracks_with_known_scene_point.reserve(tracks.size());
+  tracks_with_unknown_scene_point.reserve(tracks.size());
+
+  SARA_LOGD(logger, "Splitting feature tracks by knowledge of scene point...");
+
+  for (const auto& track : tracks)
+  {
+    const auto scene_point_indices = list_scene_point_indices(track);
+    if (scene_point_indices.empty())
+      tracks_with_unknown_scene_point.emplace_back(track);
+    else
+      tracks_with_known_scene_point.emplace_back(track);
+  }
+
+  SARA_LOGD(logger, "Tracks: {}", tracks.size());
+  SARA_LOGD(logger, "Tracks with known   scene point: {}",
+            tracks_with_known_scene_point.size());
+  SARA_LOGD(logger, "Tracks with unknown scene point: {}",
+            tracks_with_unknown_scene_point.size());
+
+  return std::make_pair(tracks_with_known_scene_point,
+                        tracks_with_unknown_scene_point);
+}
+
+auto PointCloudGenerator::retrieve_scene_point_color(
+    const Eigen::Vector3d& scene_point,  //
+    const ImageView<Rgb8>& image,        //
+    const QuaternionBasedPose<double>& pose,
+    const v2::BrownConradyDistortionModel<double>& camera) const -> Rgb64f
+{
+  const auto& w = image.width();
+  const auto& h = image.height();
+
+  // Its coordinates in the camera frame.
+  const auto camera_point = pose * scene_point;
+
+  // Its corresponding pixel coordinates in the image.
+  const Eigen::Vector2i u = camera
+                                .project(camera_point)  //
+                                .array()
+                                .round()
+                                .cast<int>();
+
+  // Clamp for safety
+  // TODO: do bilinear interpolation.
+  const auto x = std::clamp(u.x(), 0, w - 1);
+  const auto y = std::clamp(u.y(), 0, h - 1);
+
+  // N.B.: the image is an array of BGR values.
+  const auto& rgb8 = image(x, y);
+  // We store RGB values.
+  static constexpr auto normalization_factor = 1 / 255.;
+  const Rgb64f rgb64f = rgb8.cast<double>() * normalization_factor;
+
+  return rgb64f;
+}
+
+
+auto PointCloudGenerator::seed_point_cloud(
+    const std::vector<FeatureTrack>& tracks,
+    const ImageView<Rgb8>& image,  //
+    const PoseEdge pose_edge,
+    const v2::BrownConradyDistortionModel<double>& camera) -> void
+{
+  auto& logger = Logger::get();
+
+  SARA_LOGD(logger, "Transform feature tracks into best feature pairs...");
+  const auto pose_u = _pose_graph.source(pose_edge);
+  const auto pose_v = _pose_graph.target(pose_edge);
+  const auto& tsfm_u = _pose_graph[pose_u].pose;
+  const auto& tsfm_v = _pose_graph[pose_v].pose;
+  SARA_LOGD(logger, "Pose[{}]:\n{}", pose_u, tsfm_u.matrix34());
+  SARA_LOGD(logger, "Pose[{}]:\n{}", pose_v, tsfm_v.matrix34());
+
+  const auto num_tracks = static_cast<Eigen::Index>(tracks.size());
+
+  using FeatureVertexPair = std::array<FeatureVertex, 2>;
+  auto matches = std::vector<FeatureVertexPair>(num_tracks);
+  std::transform(
+      tracks.begin(), tracks.end(), matches.begin(),
+      [this, pose_u, pose_v](const FeatureTrack& track) -> FeatureVertexPair {
+        // Non-maximum suppression.
+        //
+        // We do need to filter the feature tracks by non-maximum suppression.
+        //
+        // Even in the case where the pose graph contains only 2 views, feature
+        // matches can be merged into components of cardinality larger than 2.
+        const auto track_filtered = filter_by_non_max_suppression(track);
+        if (track_filtered.size() != 2)
+          throw std::runtime_error{
+              "Error: the NMS-filtered feature track must have cardinality 2!"};
+
+        // Retrieve the cleaned up feature correspondence.
+        const auto fu = find_feature_vertex_at_pose(track_filtered, pose_u);
+        const auto fv = find_feature_vertex_at_pose(track_filtered, pose_v);
+        if (!fu.has_value() || !fv.has_value())
+          throw std::runtime_error{
+              "Error: the feature match must exist in the graph!"};
+
+        return {*fu, *fv};
+      });
+
+  SARA_LOGD(logger, "Calculating ray pairs from feature pairs...");
+  auto rays_u = Eigen::MatrixXd{3, num_tracks};
+  auto rays_v = Eigen::MatrixXd{3, num_tracks};
+  for (auto t = 0u; t < num_tracks; ++t)
+  {
+    // Collect the feature match '(x, y)'.
+    const auto& [x, y] = matches[t];
+    const auto x_coords = pixel_coords(x).cast<double>();
+    const auto y_coords = pixel_coords(y).cast<double>();
+
+    // Backproject the pixel coordinates to their corresponding incident rays on
+    // the camera plane.
+    rays_u.col(t) = camera.backproject(x_coords);
+    rays_v.col(t) = camera.backproject(y_coords);
+  }
+
+  // Calculate the associated triangulation.
+  SARA_LOGD(logger, "Initialization the point cloud by 3D triangulation from "
+                    "the relative pose...");
+  const auto motion = normalized_camera(_pose_graph[pose_edge].motion).matrix();
+  if ((tsfm_v.matrix34() - motion).norm() > 1e-6)
+    throw std::runtime_error{"Error: the target abs pose must be initialized "
+                             "as the relative motion!"};
+
+  const auto [X, scales_u, scales_v] = triangulate_linear_eigen(  //
+      tsfm_u.matrix34(), tsfm_v.matrix34(),                       //
+      rays_u, rays_v);
+
+  // Allocate the mapping from the feature vertices to the scene point index.
+  if (!_from_vertex_to_scene_point_index.empty())
+    _from_vertex_to_scene_point_index.clear();
+
+  // Calculate the initial point cloud.
+  if (!_point_cloud.empty())
+    _point_cloud.clear();
+
+  auto scene_point_index = ScenePointIndex{};
+  for (auto j = 0; j < X.cols(); ++j)
+  {
+    // Only consider **cheiral** inliers:
+    //
+    // The triangulated 3D points must be in front of the two cameras!
+    if (!(scales_u(j) > 0 && scales_v(j) > 0))
+      continue;
+
+    // Calculate the scene point.
+    const Eigen::Vector3d coords = X.col(j).hnormalized();
+    const auto color = retrieve_scene_point_color(coords, image,  //
+                                                  tsfm_v, camera);
+
+    // Store the scene point to the point cloud.
+    auto scene_point_value = ScenePoint::Value{};
+    scene_point_value << coords, color;
+    _point_cloud.emplace_back(scene_point_value);
+
+    // Recall that a match is a pair of feature vertex.
+    const auto& [x, y] = matches[j];
+
+    // Assign a scene point index to the two feature vertices.
+    _from_vertex_to_scene_point_index[x] = scene_point_index;
+    _from_vertex_to_scene_point_index[y] = scene_point_index;
+    ++scene_point_index;
+  }
+
+  SARA_LOGD(logger, "point cloud: {} 3D points", _point_cloud.size());
+}
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.hpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.hpp
index 910006d0c..81aff7072 100644
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.hpp
+++ b/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.hpp
@@ -50,7 +50,7 @@ namespace DO::Sara {
     auto split_by_scene_point_knowledge(const std::vector<FeatureTrack>&) const
         -> std::pair<std::vector<FeatureTrack>, std::vector<FeatureTrack>>;
 
-    auto init_point_cloud(const std::vector<FeatureTrack>&,  //
+    auto seed_point_cloud(const std::vector<FeatureTrack>&,  //
                           const ImageView<Rgb8>&,            //
                           const PoseEdge,
                           const v2::BrownConradyDistortionModel<double>&)
@@ -69,6 +69,11 @@ namespace DO::Sara {
       return f[feature_index];
     }
 
+    auto pixel_coords(const FeatureVertex u) const -> const Eigen::Vector2f&
+    {
+      return feature(u).center();
+    }
+
     auto barycenter(const std::vector<ScenePointIndex>&) const -> ScenePoint;
 
     auto find_feature_vertex_at_pose(const FeatureTrack&,  //

From f06936ca5f801f04cae88cc6c5c766d4a124cbe6 Mon Sep 17 00:00:00 2001
From: David OK <david.ok8@gmail.com>
Date: Mon, 15 Apr 2024 20:44:09 +0100
Subject: [PATCH 37/49] WIP: save work.

---
 .../BuildingBlocks/PointCloudGenerator.cpp    | 89 +++++++++++++++++++
 .../BuildingBlocks/PointCloudGenerator.hpp    | 42 ++++++---
 cpp/src/DO/Sara/SfM/Graph/FeatureTracker.cpp  | 20 ++---
 .../Sara/SfM/OdometryV2/OdometryPipeline.cpp  | 52 +++++++----
 4 files changed, 160 insertions(+), 43 deletions(-)

diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.cpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.cpp
index 836ebe003..56b478af6 100644
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.cpp
+++ b/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.cpp
@@ -289,3 +289,92 @@ auto PointCloudGenerator::seed_point_cloud(
 
   SARA_LOGD(logger, "point cloud: {} 3D points", _point_cloud.size());
 }
+
+auto PointCloudGenerator::propagate_scene_point_indices(
+    const std::vector<FeatureTrack>& tracks) -> void
+{
+  auto& logger = Logger::get();
+
+  SARA_LOGI(logger,
+            "Propagating scene point indices to new feature vertices...");
+
+  for (const auto& track : tracks)
+  {
+    const auto scene_point_indices = list_scene_point_indices(track);
+    if (scene_point_indices.empty())
+      continue;
+
+#if defined(DEBUG_ME)
+    if (scene_point_indices.size() > 1)
+    {
+      SARA_LOGT(logger, "Found a fused feature track...");
+
+      using ScenePointIndexVector = Eigen::RowVector<  //
+          ScenePointIndex, Eigen::Dynamic>;
+      using FeatureVerticesAsVector = Eigen::Map<
+          const Eigen::RowVector<FeatureVertexIndex, Eigen::Dynamic>>;
+      using ScenePointIndicesAsVector = Eigen::Map<  //
+          const Eigen::RowVector<ScenePointIndex, Eigen::Dynamic>>;
+
+      const ScenePointIndexVector track_vector =
+          FeatureVerticesAsVector(track.data(), track.size())
+              .cast<ScenePointIndex>();
+
+      const ScenePointIndexVector scene_index_vector =
+          ScenePointIndicesAsVector(scene_point_indices.data(),
+                                    scene_point_indices.size());
+      SARA_LOGT(logger, "track indices: {}", track_vector);
+      SARA_LOGT(logger, "scene point indices: {}", scene_index_vector);
+
+      for (const auto& i : scene_point_indices)
+        SARA_LOGT(logger, "scene coords[{}]: {}", i,
+                  Eigen::RowVector3d(_point_cloud[i].coords().transpose()));
+    }
+#endif
+
+    // 1. Calculating the barycentric scene point coordinates to disambiguate
+    // the cluster of scene points.
+    const auto scene_point = barycenter(scene_point_indices);
+    for (const auto& i : scene_point_indices)
+      _point_cloud[i] = scene_point;
+
+    // 2. Assigning a unique scene point index for each vertex of the feature
+    // track.
+    const auto& scene_point_index = scene_point_indices.front();
+    for (const auto& v : track)
+      _from_vertex_to_scene_point_index[v] = scene_point_index;
+  }
+}
+
+auto PointCloudOperator::compress_point_cloud(
+    const std::vector<FeatureTrack>& tracks) -> bool
+{
+  auto& logger = Logger::get();
+  SARA_LOGI(logger, "Compressing the point cloud...");
+
+  // Calculate the barycentric scene point for a given feature track.
+  auto point_cloud_compressed = std::vector<ScenePoint>{};
+  point_cloud_compressed.reserve(tracks.size());
+
+  // Reset the scene point index for each feature track.
+  for (auto t = ScenePointIndex{}; t < tracks.size(); ++t)
+  {
+    const auto scene_point_indices = list_scene_point_indices(tracks[t]);
+    if (scene_point_indices.empty())
+      continue;
+
+    // Reassign the scene point index for the given feature track.
+    for (const auto& v : track)
+      _from_vertex_to_scene_point_index[v] = t;
+
+    // Recalculate the scene point index as a barycenter.
+    const auto scene_point =
+        barycenter(scene_point_indices)
+            point_cloud_compressed.emplace_back(scene_point);
+  }
+
+  // Swap the point cloud with the set of barycenters.
+  std::swap(_point_cloud, point_cloud_compressed);
+
+  return true;
+}
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.hpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.hpp
index 81aff7072..fea06f4fe 100644
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.hpp
+++ b/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.hpp
@@ -20,10 +20,13 @@ namespace DO::Sara {
 
   class PointCloudGenerator
   {
-  public:
+  public: /* aliases */
     using PoseVertex = CameraPoseGraph::Vertex;
     using PoseEdge = CameraPoseGraph::Edge;
+
     using FeatureVertex = FeatureGraph::Vertex;
+    using FeatureVertexIndex = FeatureGraph::VertexIndex;
+
     using ScenePointIndex = std::size_t;
     using ScenePoint = RgbColoredPoint<double>;
 
@@ -32,6 +35,7 @@ namespace DO::Sara {
     using FeatureToScenePointMap = std::unordered_map<FeatureVertex,  //
                                                       ScenePointIndex>;
 
+  public: /* main interface */
     PointCloudGenerator(const CameraPoseGraph& camera_pose_graph,
                         const FeatureGraph& feature_graph,
                         PointCloud& point_cloud)
@@ -41,22 +45,13 @@ namespace DO::Sara {
     {
     }
 
-    auto list_scene_point_indices(const FeatureTrack&) const
-        -> std::vector<ScenePointIndex>;
-
-    auto filter_by_non_max_suppression(const FeatureTrack&) const  //
-        -> FeatureTrack;
-
-    auto split_by_scene_point_knowledge(const std::vector<FeatureTrack>&) const
-        -> std::pair<std::vector<FeatureTrack>, std::vector<FeatureTrack>>;
-
     auto seed_point_cloud(const std::vector<FeatureTrack>&,  //
                           const ImageView<Rgb8>&,            //
                           const PoseEdge,
                           const v2::BrownConradyDistortionModel<double>&)
         -> void;
 
-  public: /* utility methods */
+  public: /* helper feature retrieval methods */
     auto gid(const FeatureVertex u) const -> const FeatureGID&
     {
       return _feature_graph[u];
@@ -74,7 +69,9 @@ namespace DO::Sara {
       return feature(u).center();
     }
 
-    auto barycenter(const std::vector<ScenePointIndex>&) const -> ScenePoint;
+  public: /* helper query methods */
+    auto list_scene_point_indices(const FeatureTrack&) const
+        -> std::vector<ScenePointIndex>;
 
     auto find_feature_vertex_at_pose(const FeatureTrack&,  //
                                      const PoseVertex) const
@@ -86,7 +83,26 @@ namespace DO::Sara {
         const QuaternionBasedPose<double>& pose,
         const v2::BrownConradyDistortionModel<double>& camera) const -> Rgb64f;
 
-  private:
+  public: /* data transformation methods */
+    auto barycenter(const std::vector<ScenePointIndex>&) const -> ScenePoint;
+
+    auto filter_by_non_max_suppression(const FeatureTrack&) const  //
+        -> FeatureTrack;
+
+    auto split_by_scene_point_knowledge(const std::vector<FeatureTrack>&) const
+        -> std::pair<std::vector<FeatureTrack>, std::vector<FeatureTrack>>;
+
+    auto propagate_scene_point_indices(const std::vector<FeatureTrack>&)
+        -> void;
+
+    //! - The point cloud compression reassigns a unique scene point cloud to
+    //!   each feature tracks.
+    //! - The scene point is recalculated as the barycenter of the
+    //!   possibly multiple scene points we have found after recalculating the
+    //!   feature tracks.
+    auto compress_point_cloud(const std::vector<FeatureTrack>&) -> bool;
+
+  private: /* data members */
     const CameraPoseGraph& _pose_graph;
     const FeatureGraph& _feature_graph;
     PointCloud& _point_cloud;
diff --git a/cpp/src/DO/Sara/SfM/Graph/FeatureTracker.cpp b/cpp/src/DO/Sara/SfM/Graph/FeatureTracker.cpp
index 88095b0d3..611a8c298 100644
--- a/cpp/src/DO/Sara/SfM/Graph/FeatureTracker.cpp
+++ b/cpp/src/DO/Sara/SfM/Graph/FeatureTracker.cpp
@@ -19,21 +19,18 @@
 using namespace DO::Sara;
 
 auto FeatureTracker::update_feature_tracks(
-    const CameraPoseGraph& pose_graph,
-    const CameraPoseGraph::Edge pose_edge) -> void
+    const CameraPoseGraph& pose_graph, const CameraPoseGraph::Edge pose_edge)
+    -> void
 {
   auto& logger = Logger::get();
 
-  const CameraPoseGraph::Impl& pg = pose_graph;
-  FeatureGraph::Impl& fg = _feature_graph;
-
   // Retrieve the camera poses from the relative pose edge.
-  const auto pose_u = boost::source(pose_edge, pg);
-  const auto pose_v = boost::target(pose_edge, pg);
+  const auto pose_u = pose_graph.source(pose_edge);
+  const auto pose_v = pose_graph.target(pose_edge);
   // The relative pose edge contains the set of all feature correspondences.
-  const auto& matches = pg[pose_edge].matches;
+  const auto& matches = pose_graph[pose_edge].matches;
   // Which of these feature correspondences are marked as inliers?
-  const auto& inliers = pg[pose_edge].inliers;
+  const auto& inliers = pose_graph[pose_edge].inliers;
 
   // Add the feature graph edges.
   //
@@ -70,6 +67,7 @@ auto FeatureTracker::update_feature_tracks(
     const auto y_does_not_exist_yet = it_y == _feature_vertex.end();
 
     // If not, add them if necessary.
+    FeatureGraph::Impl& fg = _feature_graph;
     const auto x = x_does_not_exist_yet ? boost::add_vertex(fg) : it_x->second;
     const auto y = y_does_not_exist_yet ? boost::add_vertex(fg) : it_y->second;
 
@@ -88,8 +86,8 @@ auto FeatureTracker::update_feature_tracks(
     // navigate between the feature graph to the pose graph.
     const auto [xy, xy_added] = boost::add_edge(x, y, fg);
     auto& xy_attrs = fg[xy];
-    xy_attrs.pose_src = boost::source(pose_edge, pg);
-    xy_attrs.pose_dst = boost::target(pose_edge, pg);
+    xy_attrs.pose_src = pose_graph.source(pose_edge);
+    xy_attrs.pose_dst = pose_graph.target(pose_edge);
     xy_attrs.index = m;
   }
 
diff --git a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
index 5cc32068a..9dfcf3359 100644
--- a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
+++ b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
@@ -120,9 +120,9 @@ auto v2::OdometryPipeline::add_camera_pose() -> bool
   // Detect and describe the local features.
   _pose_prev = _pose_curr;
 
-  const auto frame = _distortion_corrector->frame_gray32f();
+  const auto frame_gray32f = _distortion_corrector->frame_gray32f();
   const auto frame_number = _video_streamer.frame_number();
-  auto keys_curr = detect_keypoints(frame);
+  auto keys_curr = detect_keypoints(frame_gray32f);
 
   // Boundary case: the graphs are empty.
   if (_pose_graph.num_vertices() == 0)
@@ -176,27 +176,41 @@ auto v2::OdometryPipeline::add_camera_pose() -> bool
     std::tie(_tracks_alive, _track_visibility_count) =
         _feature_tracker.calculate_alive_feature_tracks(_pose_curr);
 
-    // 4. TODO: Init point cloud
-
-    // 5. TODO: don't add 3D scene points that are too far, like point in the
-    //    sky
+    // 4. Initialize the point cloud.
+    //
+    // TODO: don't add 3D scene points that are too far, like point in the
+    // sky
+    const auto frame_rgb8 = _distortion_corrector->frame_rgb8();
+    _point_cloud_generator->seed_point_cloud(_tracks_alive, frame_rgb8,
+                                             pose_edge, _camera);
 
     return true;
   }
 
-  // 1. Grow the feature graph first by adding the feature matches that are
-  //    deemed reliable from the relative pose estimation.
-  // 2. Recalculate the feature tracks.
-  // 3. Get the feature tracks that are still alive.
-  // 4. For each feature track still alive, get the corresponding scene
-  //    points.
-  //    Each alive feature track still has the same old feature IDs in the
-  //    previous image frames, and we know their scene points.
-  //    Use triangulation computer vision task, to calculate the new camera
-  //    absolute pose.
-  // 5. With the camera absolute pose, add the new scene points.
-  //    Specifically, they are the alive feature tracks (with cardinality 2)
-  //    for which we don't know the scene points yet.
+  // 1. Update the feature tracks by adding the feature matches that are
+  //    verified by the relative pose estimation.
+  const auto pose_edge = _pose_graph.add_relative_pose(  //
+      _pose_prev, _pose_curr,                            //
+      std::move(rel_pose_data));
+  _feature_tracker.update_feature_tracks(_pose_graph, pose_edge);
+
+  // 2. Recalculate the feature tracks that are still alive.
+  std::tie(_tracks_alive, _track_visibility_count) =
+      _feature_tracker.calculate_alive_feature_tracks(_pose_curr);
+
+  // 2. Propagate the scene point to the feature tracks that grew longer.
+  //    The feature tracks that grew longer can only be those among the tracks
+  //    still alive.
+  SARA_LOGI(logger, "Propagating the scene points to new features...");
+  _point_cloud_generator->propagate_scene_point_indices(_tracks_alive);
+
+  // 3. Reassign a unique scene point cloud to each feature tracks by
+  //    compressing the point cloud.
+  SARA_LOGI(logger, "Compressing the point cloud...");
+  _point_cloud_generator->compress_point_cloud(
+      _feature_tracker._feature_tracks);
+
+  // 4. Determine the current absolute pose from the alive tracks.
 
   // TODO: Grow point cloud by triangulation.
   return false;

From 44a55c4bb2e1c1ca9d9288e344c4f8b68643dd64 Mon Sep 17 00:00:00 2001
From: Odd Kiva <2375733-oddkiva@users.noreply.gitlab.com>
Date: Mon, 15 Apr 2024 21:58:12 +0100
Subject: [PATCH 38/49] WIP: save work.

---
 .../BuildingBlocks/PointCloudGenerator.cpp    | 17 +++++------
 .../BuildingBlocks/PointCloudGenerator.hpp    | 29 ++++++++++++++-----
 .../SfM/BuildingBlocks/RgbColoredPoint.hpp    |  5 ++++
 3 files changed, 35 insertions(+), 16 deletions(-)

diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.cpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.cpp
index 56b478af6..5f4925b59 100644
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.cpp
+++ b/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.cpp
@@ -83,8 +83,8 @@ auto PointCloudGenerator::filter_by_non_max_suppression(
 }
 
 auto PointCloudGenerator::find_feature_vertex_at_pose(
-    const FeatureTrack& track, const PoseVertex pose_vertex) const
-    -> std::optional<FeatureVertex>
+    const FeatureTrack& track,
+    const PoseVertex pose_vertex) const -> std::optional<FeatureVertex>
 {
   auto v = std::find_if(track.begin(), track.end(),
                         [this, pose_vertex](const auto& v) {
@@ -93,7 +93,6 @@ auto PointCloudGenerator::find_feature_vertex_at_pose(
   return v == track.end() ? std::nullopt : std::make_optional(*v);
 }
 
-
 auto PointCloudGenerator::barycenter(
     const std::vector<ScenePointIndex>& scene_point_indices) const -> ScenePoint
 {
@@ -178,7 +177,6 @@ auto PointCloudGenerator::retrieve_scene_point_color(
   return rgb64f;
 }
 
-
 auto PointCloudGenerator::seed_point_cloud(
     const std::vector<FeatureTrack>& tracks,
     const ImageView<Rgb8>& image,  //
@@ -346,7 +344,7 @@ auto PointCloudGenerator::propagate_scene_point_indices(
   }
 }
 
-auto PointCloudOperator::compress_point_cloud(
+auto PointCloudGenerator::compress_point_cloud(
     const std::vector<FeatureTrack>& tracks) -> bool
 {
   auto& logger = Logger::get();
@@ -359,7 +357,9 @@ auto PointCloudOperator::compress_point_cloud(
   // Reset the scene point index for each feature track.
   for (auto t = ScenePointIndex{}; t < tracks.size(); ++t)
   {
-    const auto scene_point_indices = list_scene_point_indices(tracks[t]);
+    const auto& track = tracks[t];
+
+    const auto scene_point_indices = list_scene_point_indices(track);
     if (scene_point_indices.empty())
       continue;
 
@@ -368,9 +368,8 @@ auto PointCloudOperator::compress_point_cloud(
       _from_vertex_to_scene_point_index[v] = t;
 
     // Recalculate the scene point index as a barycenter.
-    const auto scene_point =
-        barycenter(scene_point_indices)
-            point_cloud_compressed.emplace_back(scene_point);
+    const auto scene_point = barycenter(scene_point_indices);
+    point_cloud_compressed.emplace_back(scene_point);
   }
 
   // Swap the point cloud with the set of barycenters.
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.hpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.hpp
index fea06f4fe..9d011df5f 100644
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.hpp
+++ b/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.hpp
@@ -45,11 +45,11 @@ namespace DO::Sara {
     {
     }
 
-    auto seed_point_cloud(const std::vector<FeatureTrack>&,  //
-                          const ImageView<Rgb8>&,            //
-                          const PoseEdge,
-                          const v2::BrownConradyDistortionModel<double>&)
-        -> void;
+    auto
+    seed_point_cloud(const std::vector<FeatureTrack>&,  //
+                     const ImageView<Rgb8>&,            //
+                     const PoseEdge,
+                     const v2::BrownConradyDistortionModel<double>&) -> void;
 
   public: /* helper feature retrieval methods */
     auto gid(const FeatureVertex u) const -> const FeatureGID&
@@ -84,16 +84,31 @@ namespace DO::Sara {
         const v2::BrownConradyDistortionModel<double>& camera) const -> Rgb64f;
 
   public: /* data transformation methods */
+    //! @brief Calculate the barycentric scene point.
+    //!
+    //! We expect the array of scene point indices to originate from a
+    //! feature track.
     auto barycenter(const std::vector<ScenePointIndex>&) const -> ScenePoint;
 
+    //! @brief A track is a sequence of features.
     auto filter_by_non_max_suppression(const FeatureTrack&) const  //
         -> FeatureTrack;
 
+    //! @brief Split the list of feature tracks into two lists.
+    //!
+    //! The first list contains the tracks for which a scene point is calculated.
+    //! The second list contains the tracks for which a scene point is not yet
+    //! calculated.
     auto split_by_scene_point_knowledge(const std::vector<FeatureTrack>&) const
         -> std::pair<std::vector<FeatureTrack>, std::vector<FeatureTrack>>;
 
-    auto propagate_scene_point_indices(const std::vector<FeatureTrack>&)
-        -> void;
+    //! - The point cloud compression reassigns a unique scene point cloud to
+    //!   each feature tracks.
+    //! - The scene point is recalculated as a the barycenter of the
+    //!   possibly multiple scene points we have found after recalculating the
+    //!   feature tracks.
+    auto
+    propagate_scene_point_indices(const std::vector<FeatureTrack>&) -> void;
 
     //! - The point cloud compression reassigns a unique scene point cloud to
     //!   each feature tracks.
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/RgbColoredPoint.hpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/RgbColoredPoint.hpp
index 8e94875ab..b57fac448 100644
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks/RgbColoredPoint.hpp
+++ b/cpp/src/DO/Sara/SfM/BuildingBlocks/RgbColoredPoint.hpp
@@ -15,6 +15,11 @@ namespace DO::Sara {
 
     RgbColoredPoint() = default;
 
+    RgbColoredPoint(const Coords& coords, const Color& color)
+    {
+      _v << coords, color;
+    }
+
     RgbColoredPoint(const Value& v)
       : _v{v}
     {

From 4f1d0524d99df6842b3e1972d05867bf73c980e6 Mon Sep 17 00:00:00 2001
From: David OK <david.ok8@gmail.com>
Date: Mon, 15 Apr 2024 22:41:20 +0100
Subject: [PATCH 39/49] WIP: save work.

---
 .../visual_odometry_example_v2.cpp            |  18 ++-
 .../Sara/SfM/OdometryV2/OdometryPipeline.cpp  | 115 +++++++++---------
 .../Sara/SfM/OdometryV2/OdometryPipeline.hpp  |   8 +-
 3 files changed, 79 insertions(+), 62 deletions(-)

diff --git a/cpp/examples/Sara/MultiViewGeometry/visual_odometry_example_v2.cpp b/cpp/examples/Sara/MultiViewGeometry/visual_odometry_example_v2.cpp
index 4d4d24aa0..0c51d4f60 100644
--- a/cpp/examples/Sara/MultiViewGeometry/visual_odometry_example_v2.cpp
+++ b/cpp/examples/Sara/MultiViewGeometry/visual_odometry_example_v2.cpp
@@ -18,6 +18,7 @@
 #include <DO/Kalpana/Math/Projection.hpp>
 #include <DO/Kalpana/Math/Viewport.hpp>
 
+#include <DO/Sara/Logging/Logger.hpp>
 #include <DO/Sara/SfM/OdometryV2/OdometryPipeline.hpp>
 
 #if defined(_WIN32)
@@ -170,8 +171,21 @@ class SingleWindowApp
 
   auto upload_point_cloud_data_to_opengl() -> void
   {
-    // _point_cloud.upload_host_data_to_gl(
-    //     _pipeline._triangulator->_colored_point_cloud);
+    const auto& point_cloud = _pipeline.point_cloud();
+    const auto ptr =
+        const_cast<sara::PointCloudGenerator::ScenePoint*>(point_cloud.data());
+    const auto ptrd = reinterpret_cast<double*>(ptr);
+
+    const auto num_points = static_cast<int>(point_cloud.size());
+    static constexpr auto dim = 6;
+    const auto pc_tview = sara::TensorView_<double, 2>{
+        ptrd,              //
+        {num_points, dim}  //
+    };
+
+    auto& logger = sara::Logger::get();
+    SARA_LOGW(logger, "point cloud dimensions: {} ", pc_tview.sizes());
+    _point_cloud.upload_host_data_to_gl(pc_tview.cast<float>());
   }
 
   auto render_video() -> void
diff --git a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
index 9dfcf3359..a814cd946 100644
--- a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
+++ b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
@@ -54,7 +54,7 @@ auto v2::OdometryPipeline::process() -> void
 
   _distortion_corrector->undistort();
 
-  add_camera_pose();
+  grow_geometry();
 }
 
 auto v2::OdometryPipeline::make_display_frame() const -> Image<Rgb8>
@@ -113,7 +113,7 @@ auto v2::OdometryPipeline::estimate_relative_pose(
   return res;
 }
 
-auto v2::OdometryPipeline::add_camera_pose() -> bool
+auto v2::OdometryPipeline::grow_geometry() -> bool
 {
   auto& logger = Logger::get();
 
@@ -151,67 +151,68 @@ auto v2::OdometryPipeline::add_camera_pose() -> bool
   }
   SARA_LOGI(logger, "[SfM] Relative pose succeeded!");
 
-  if (_pose_graph.num_vertices() == 1)
-  {
-    auto abs_pose_curr = QuaternionBasedPose<double>{
-        .q = Eigen::Quaterniond{rel_pose_data.motion.R},
-        .t = rel_pose_data.motion.t  //
-    };
-
-    auto abs_pose_data = AbsolutePoseData{
-        frame_number,             //
-        std::move(keys_curr),     //
-        std::move(abs_pose_curr)  //
-    };
+  // if (_pose_graph.num_vertices() == 1)
+  // {
+  auto abs_pose_curr = QuaternionBasedPose<double>{
+      .q = Eigen::Quaterniond{rel_pose_data.motion.R},
+      .t = rel_pose_data.motion.t  //
+  };
 
-    // 1. Add the absolute pose vertex.
-    _pose_graph.add_absolute_pose(std::move(abs_pose_data));
+  auto abs_pose_data = AbsolutePoseData{
+      frame_number,             //
+      std::move(keys_curr),     //
+      std::move(abs_pose_curr)  //
+  };
 
-    // 2. Add the pose edge, which will invalidate the relative pose data.
-    const auto pose_edge = _pose_graph.add_relative_pose(
-        _pose_prev, _pose_curr, std::move(rel_pose_data));
+  // 1. Add the absolute pose vertex.
+  _pose_graph.add_absolute_pose(std::move(abs_pose_data));
 
-    // 3. Grow the feature graph by adding the feature matches.
-    _feature_tracker.update_feature_tracks(_pose_graph, pose_edge);
-    std::tie(_tracks_alive, _track_visibility_count) =
-        _feature_tracker.calculate_alive_feature_tracks(_pose_curr);
+  // 2. Add the pose edge, which will invalidate the relative pose data.
+  const auto pose_edge = _pose_graph.add_relative_pose(
+      _pose_prev, _pose_curr, std::move(rel_pose_data));
 
-    // 4. Initialize the point cloud.
-    //
-    // TODO: don't add 3D scene points that are too far, like point in the
-    // sky
-    const auto frame_rgb8 = _distortion_corrector->frame_rgb8();
-    _point_cloud_generator->seed_point_cloud(_tracks_alive, frame_rgb8,
-                                             pose_edge, _camera);
-
-    return true;
-  }
-
-  // 1. Update the feature tracks by adding the feature matches that are
-  //    verified by the relative pose estimation.
-  const auto pose_edge = _pose_graph.add_relative_pose(  //
-      _pose_prev, _pose_curr,                            //
-      std::move(rel_pose_data));
+  // 3. Grow the feature graph by adding the feature matches.
   _feature_tracker.update_feature_tracks(_pose_graph, pose_edge);
-
-  // 2. Recalculate the feature tracks that are still alive.
   std::tie(_tracks_alive, _track_visibility_count) =
       _feature_tracker.calculate_alive_feature_tracks(_pose_curr);
 
-  // 2. Propagate the scene point to the feature tracks that grew longer.
-  //    The feature tracks that grew longer can only be those among the tracks
-  //    still alive.
-  SARA_LOGI(logger, "Propagating the scene points to new features...");
-  _point_cloud_generator->propagate_scene_point_indices(_tracks_alive);
-
-  // 3. Reassign a unique scene point cloud to each feature tracks by
-  //    compressing the point cloud.
-  SARA_LOGI(logger, "Compressing the point cloud...");
-  _point_cloud_generator->compress_point_cloud(
-      _feature_tracker._feature_tracks);
-
-  // 4. Determine the current absolute pose from the alive tracks.
-
-  // TODO: Grow point cloud by triangulation.
-  return false;
+  // 4. Initialize the point cloud.
+  //
+  // TODO: don't add 3D scene points that are too far, like point in the
+  // sky
+  const auto frame_rgb8 = _distortion_corrector->frame_rgb8();
+  _point_cloud_generator->seed_point_cloud(_tracks_alive, frame_rgb8, pose_edge,
+                                           _camera);
+
+  return true;
+  // }
+
+  // // 1. Update the feature tracks by adding the feature matches that are
+  // //    verified by the relative pose estimation.
+  // const auto pose_edge = _pose_graph.add_relative_pose(  //
+  //     _pose_prev, _pose_curr,                            //
+  //     std::move(rel_pose_data));
+  // _feature_tracker.update_feature_tracks(_pose_graph, pose_edge);
+
+  // // 2. Recalculate the feature tracks that are still alive.
+  // std::tie(_tracks_alive, _track_visibility_count) =
+  //     _feature_tracker.calculate_alive_feature_tracks(_pose_curr);
+
+  // // 2. Propagate the scene point to the feature tracks that grew longer.
+  // //    The feature tracks that grew longer can only be those among the
+  // tracks
+  // //    still alive.
+  // SARA_LOGI(logger, "Propagating the scene points to new features...");
+  // _point_cloud_generator->propagate_scene_point_indices(_tracks_alive);
+
+  // // 3. Reassign a unique scene point cloud to each feature tracks by
+  // //    compressing the point cloud.
+  // SARA_LOGI(logger, "Compressing the point cloud...");
+  // _point_cloud_generator->compress_point_cloud(
+  //     _feature_tracker._feature_tracks);
+
+  // // 4. Determine the current absolute pose from the alive tracks.
+
+  // // TODO: Grow point cloud by triangulation.
+  // return false;
 }
diff --git a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.hpp b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.hpp
index 90c6a2ab3..e71e30b57 100644
--- a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.hpp
+++ b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.hpp
@@ -34,8 +34,10 @@ namespace DO::Sara::v2 {
 
     auto make_display_frame() const -> Image<Rgb8>;
 
-    auto detect_keypoints() -> const KeypointList<OERegion, float>&;
-    auto estimate_relative_pose() -> const RelativePoseData&;
+    auto point_cloud() const -> const PointCloudGenerator::PointCloud&
+    {
+      return _point_cloud;
+    }
 
   private: /* computer vision tasks */
     auto detect_keypoints(const ImageView<float>&) const
@@ -47,7 +49,7 @@ namespace DO::Sara::v2 {
         -> std::pair<RelativePoseData, TwoViewGeometry>;
 
   private: /* graph update tasks */
-    auto add_camera_pose() -> bool;
+    auto grow_geometry() -> bool;
 
   public: /* data members */
     VideoStreamer _video_streamer;

From cfbb7409f867b009bdd9396c897100e7b4f96eff Mon Sep 17 00:00:00 2001
From: David OK <david.ok8@gmail.com>
Date: Tue, 16 Apr 2024 12:17:16 +0100
Subject: [PATCH 40/49] WIP: deduplicated code.

---
 .../BuildingBlocks/PointCloudGenerator.cpp    | 241 ++++++++++--------
 .../BuildingBlocks/PointCloudGenerator.hpp    |  30 ++-
 .../Sara/SfM/OdometryV2/OdometryPipeline.cpp  |   4 +-
 3 files changed, 149 insertions(+), 126 deletions(-)

diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.cpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.cpp
index 5f4925b59..1dfaf5054 100644
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.cpp
+++ b/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.cpp
@@ -83,8 +83,8 @@ auto PointCloudGenerator::filter_by_non_max_suppression(
 }
 
 auto PointCloudGenerator::find_feature_vertex_at_pose(
-    const FeatureTrack& track,
-    const PoseVertex pose_vertex) const -> std::optional<FeatureVertex>
+    const FeatureTrack& track, const PoseVertex pose_vertex) const
+    -> std::optional<FeatureVertex>
 {
   auto v = std::find_if(track.begin(), track.end(),
                         [this, pose_vertex](const auto& v) {
@@ -177,117 +177,6 @@ auto PointCloudGenerator::retrieve_scene_point_color(
   return rgb64f;
 }
 
-auto PointCloudGenerator::seed_point_cloud(
-    const std::vector<FeatureTrack>& tracks,
-    const ImageView<Rgb8>& image,  //
-    const PoseEdge pose_edge,
-    const v2::BrownConradyDistortionModel<double>& camera) -> void
-{
-  auto& logger = Logger::get();
-
-  SARA_LOGD(logger, "Transform feature tracks into best feature pairs...");
-  const auto pose_u = _pose_graph.source(pose_edge);
-  const auto pose_v = _pose_graph.target(pose_edge);
-  const auto& tsfm_u = _pose_graph[pose_u].pose;
-  const auto& tsfm_v = _pose_graph[pose_v].pose;
-  SARA_LOGD(logger, "Pose[{}]:\n{}", pose_u, tsfm_u.matrix34());
-  SARA_LOGD(logger, "Pose[{}]:\n{}", pose_v, tsfm_v.matrix34());
-
-  const auto num_tracks = static_cast<Eigen::Index>(tracks.size());
-
-  using FeatureVertexPair = std::array<FeatureVertex, 2>;
-  auto matches = std::vector<FeatureVertexPair>(num_tracks);
-  std::transform(
-      tracks.begin(), tracks.end(), matches.begin(),
-      [this, pose_u, pose_v](const FeatureTrack& track) -> FeatureVertexPair {
-        // Non-maximum suppression.
-        //
-        // We do need to filter the feature tracks by non-maximum suppression.
-        //
-        // Even in the case where the pose graph contains only 2 views, feature
-        // matches can be merged into components of cardinality larger than 2.
-        const auto track_filtered = filter_by_non_max_suppression(track);
-        if (track_filtered.size() != 2)
-          throw std::runtime_error{
-              "Error: the NMS-filtered feature track must have cardinality 2!"};
-
-        // Retrieve the cleaned up feature correspondence.
-        const auto fu = find_feature_vertex_at_pose(track_filtered, pose_u);
-        const auto fv = find_feature_vertex_at_pose(track_filtered, pose_v);
-        if (!fu.has_value() || !fv.has_value())
-          throw std::runtime_error{
-              "Error: the feature match must exist in the graph!"};
-
-        return {*fu, *fv};
-      });
-
-  SARA_LOGD(logger, "Calculating ray pairs from feature pairs...");
-  auto rays_u = Eigen::MatrixXd{3, num_tracks};
-  auto rays_v = Eigen::MatrixXd{3, num_tracks};
-  for (auto t = 0u; t < num_tracks; ++t)
-  {
-    // Collect the feature match '(x, y)'.
-    const auto& [x, y] = matches[t];
-    const auto x_coords = pixel_coords(x).cast<double>();
-    const auto y_coords = pixel_coords(y).cast<double>();
-
-    // Backproject the pixel coordinates to their corresponding incident rays on
-    // the camera plane.
-    rays_u.col(t) = camera.backproject(x_coords);
-    rays_v.col(t) = camera.backproject(y_coords);
-  }
-
-  // Calculate the associated triangulation.
-  SARA_LOGD(logger, "Initialization the point cloud by 3D triangulation from "
-                    "the relative pose...");
-  const auto motion = normalized_camera(_pose_graph[pose_edge].motion).matrix();
-  if ((tsfm_v.matrix34() - motion).norm() > 1e-6)
-    throw std::runtime_error{"Error: the target abs pose must be initialized "
-                             "as the relative motion!"};
-
-  const auto [X, scales_u, scales_v] = triangulate_linear_eigen(  //
-      tsfm_u.matrix34(), tsfm_v.matrix34(),                       //
-      rays_u, rays_v);
-
-  // Allocate the mapping from the feature vertices to the scene point index.
-  if (!_from_vertex_to_scene_point_index.empty())
-    _from_vertex_to_scene_point_index.clear();
-
-  // Calculate the initial point cloud.
-  if (!_point_cloud.empty())
-    _point_cloud.clear();
-
-  auto scene_point_index = ScenePointIndex{};
-  for (auto j = 0; j < X.cols(); ++j)
-  {
-    // Only consider **cheiral** inliers:
-    //
-    // The triangulated 3D points must be in front of the two cameras!
-    if (!(scales_u(j) > 0 && scales_v(j) > 0))
-      continue;
-
-    // Calculate the scene point.
-    const Eigen::Vector3d coords = X.col(j).hnormalized();
-    const auto color = retrieve_scene_point_color(coords, image,  //
-                                                  tsfm_v, camera);
-
-    // Store the scene point to the point cloud.
-    auto scene_point_value = ScenePoint::Value{};
-    scene_point_value << coords, color;
-    _point_cloud.emplace_back(scene_point_value);
-
-    // Recall that a match is a pair of feature vertex.
-    const auto& [x, y] = matches[j];
-
-    // Assign a scene point index to the two feature vertices.
-    _from_vertex_to_scene_point_index[x] = scene_point_index;
-    _from_vertex_to_scene_point_index[y] = scene_point_index;
-    ++scene_point_index;
-  }
-
-  SARA_LOGD(logger, "point cloud: {} 3D points", _point_cloud.size());
-}
-
 auto PointCloudGenerator::propagate_scene_point_indices(
     const std::vector<FeatureTrack>& tracks) -> void
 {
@@ -377,3 +266,129 @@ auto PointCloudGenerator::compress_point_cloud(
 
   return true;
 }
+
+auto PointCloudGenerator::grow_point_cloud(
+    const std::vector<FeatureTrack>& ftracks_without_scene_point,
+    const ImageView<Rgb8>& image,  //
+    const PoseEdge pose_edge,
+    const v2::BrownConradyDistortionModel<double>& camera) -> void
+{
+  auto& logger = Logger::get();
+
+  SARA_LOGD(logger,
+            "Extracting the pairwise matches from the feature tracks...");
+  const auto& pose_u = _pose_graph.source(pose_edge);
+  const auto& pose_v = _pose_graph.target(pose_edge);
+  const auto& tsfm_u = _pose_graph[pose_u].pose;
+  const auto& tsfm_v = _pose_graph[pose_v].pose;
+  SARA_LOGD(logger, "Pose[{}]:\n{}", pose_u, tsfm_u.matrix34());
+  SARA_LOGD(logger, "Pose[{}]:\n{}", pose_v, tsfm_v.matrix34());
+
+  const auto num_tracks =
+      static_cast<Eigen::Index>(ftracks_without_scene_point.size());
+
+  using FeatureVertexPair = std::array<FeatureVertex, 2>;
+  auto fmatches = std::vector<FeatureVertexPair>(num_tracks);
+
+  SARA_LOGD(logger, "Calculating feature matches...");
+  std::transform(
+      ftracks_without_scene_point.begin(),
+      ftracks_without_scene_point.end(),  //
+      fmatches.begin(),
+      [this, pose_u, pose_v](const FeatureTrack& ftrack) -> FeatureVertexPair {
+        // Non-maximum suppression.
+        //
+        // We do need to filter the feature tracks by non-maximum suppression.
+        //
+        // Even in the case where the pose graph contains only 2 views, feature
+        // matches can be merged into components of cardinality larger than 2.
+        const auto ftrack_nms = filter_by_non_max_suppression(ftrack);
+
+        // N.B.: the feature track cannot have any scene point indices at this
+        // point.
+        const auto scene_point_indices = list_scene_point_indices(ftrack_nms);
+        if (!scene_point_indices.empty())
+          throw std::runtime_error{
+              "Error: the feature track cannot have any scene point index!"};
+
+      // N.B.: at this point a track of visibility count >= 3 can possibly
+      // have no scene point.
+      //
+      // This happens when the cheirality is not satisfied. When the
+      // cheirality is not satisfied, we choose not to assign a scene point to
+      // this feature track.
+#if defined(DEBUG_ME)
+        const Eigen::RowVector<ScenePointIndex, Eigen::Dynamic> feature_vector =
+            Eigen::Map<
+                const Eigen::RowVector<FeatureVertexIndex, Eigen::Dynamic>>(
+                track_filtered.data(), track_filtered.size());
+        SARA_LOGD(logger, "track indices: {}", feature_vector);
+#endif
+
+
+        // Retrieve the cleaned up feature correspondence.
+        const auto fu = find_feature_vertex_at_pose(ftrack_nms, pose_u);
+        const auto fv = find_feature_vertex_at_pose(ftrack_nms, pose_v);
+        if (!fu.has_value() || !fv.has_value())
+          throw std::runtime_error{
+              "Error: the feature match must exist in the feature graph!"};
+
+        return {*fu, *fv};
+      });
+
+  SARA_LOGD(logger,
+            "Calculating the backprojected rays from the feature matches...");
+  auto rays_u = Eigen::MatrixXd{3, num_tracks};
+  auto rays_v = Eigen::MatrixXd{3, num_tracks};
+  for (auto t = 0u; t < num_tracks; ++t)
+  {
+    // Collect the feature match '(x, y)'.
+    const auto& [x, y] = fmatches[t];
+    const auto x_coords = pixel_coords(x).cast<double>();
+    const auto y_coords = pixel_coords(y).cast<double>();
+
+    // Backproject the pixel coordinates to their corresponding incident rays on
+    // the camera plane.
+    rays_u.col(t) = camera.backproject(x_coords);
+    rays_v.col(t) = camera.backproject(y_coords);
+  }
+
+  // Calculate the associated triangulation.
+  SARA_LOGD(logger, "Triangulating from the backprojected rays...");
+  const auto [X, scales_u, scales_v] = triangulate_linear_eigen(  //
+      tsfm_u.matrix34(), tsfm_v.matrix34(),                       //
+      rays_u, rays_v);
+
+  SARA_LOGD(logger, "Adding new scene points to point cloud...");
+  SARA_LOGD(logger, "[BEFORE] point cloud: {} 3D points", _point_cloud.size());
+
+  // N.B.: start with the right offset for the scene point index.
+  auto scene_point_index = _point_cloud.size();
+  for (auto j = 0; j < X.cols(); ++j)
+  {
+    // Only consider **cheiral** inliers:
+    //
+    // The triangulated 3D points must be in front of the two cameras!
+    if (!(scales_u(j) > 0 && scales_v(j) > 0))
+      continue;
+
+    // Calculate the scene point.
+    const Eigen::Vector3d coords = X.col(j).hnormalized();
+    const auto color = retrieve_scene_point_color(coords, image,  //
+                                                  tsfm_v, camera);
+
+    // Store the scene point to the point cloud.
+    auto scene_point_value = ScenePoint{coords, color};
+    _point_cloud.emplace_back(std::move(scene_point_value));
+
+    // Recall that a match is a pair of feature vertices.
+    const auto& [x, y] = fmatches[j];
+
+    // Assign a scene point index to the two feature vertices.
+    _from_vertex_to_scene_point_index[x] = scene_point_index;
+    _from_vertex_to_scene_point_index[y] = scene_point_index;
+    ++scene_point_index;
+  }
+
+  SARA_LOGD(logger, "[AFTER ] point cloud: {} 3D points", _point_cloud.size());
+}
diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.hpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.hpp
index 9d011df5f..4865f9a94 100644
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.hpp
+++ b/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.hpp
@@ -35,7 +35,7 @@ namespace DO::Sara {
     using FeatureToScenePointMap = std::unordered_map<FeatureVertex,  //
                                                       ScenePointIndex>;
 
-  public: /* main interface */
+  public: /* constructor */
     PointCloudGenerator(const CameraPoseGraph& camera_pose_graph,
                         const FeatureGraph& feature_graph,
                         PointCloud& point_cloud)
@@ -45,12 +45,6 @@ namespace DO::Sara {
     {
     }
 
-    auto
-    seed_point_cloud(const std::vector<FeatureTrack>&,  //
-                     const ImageView<Rgb8>&,            //
-                     const PoseEdge,
-                     const v2::BrownConradyDistortionModel<double>&) -> void;
-
   public: /* helper feature retrieval methods */
     auto gid(const FeatureVertex u) const -> const FeatureGID&
     {
@@ -96,9 +90,9 @@ namespace DO::Sara {
 
     //! @brief Split the list of feature tracks into two lists.
     //!
-    //! The first list contains the tracks for which a scene point is calculated.
-    //! The second list contains the tracks for which a scene point is not yet
-    //! calculated.
+    //! The first list contains the tracks for which a scene point is
+    //! calculated. The second list contains the tracks for which a scene point
+    //! is not yet calculated.
     auto split_by_scene_point_knowledge(const std::vector<FeatureTrack>&) const
         -> std::pair<std::vector<FeatureTrack>, std::vector<FeatureTrack>>;
 
@@ -107,8 +101,8 @@ namespace DO::Sara {
     //! - The scene point is recalculated as a the barycenter of the
     //!   possibly multiple scene points we have found after recalculating the
     //!   feature tracks.
-    auto
-    propagate_scene_point_indices(const std::vector<FeatureTrack>&) -> void;
+    auto propagate_scene_point_indices(const std::vector<FeatureTrack>&)
+        -> void;
 
     //! - The point cloud compression reassigns a unique scene point cloud to
     //!   each feature tracks.
@@ -117,6 +111,18 @@ namespace DO::Sara {
     //!   feature tracks.
     auto compress_point_cloud(const std::vector<FeatureTrack>&) -> bool;
 
+    //! Growing the point cloud becomes possible when the most recent absolute
+    //! pose is known.
+    //!
+    //! This calculates the new 3D scene points. Specifically the new 3D scene
+    //! points are those calculated from the feature tracks for which we didn't
+    //! know their scene point values.
+    auto grow_point_cloud(
+        const std::vector<FeatureTrack>& feature_tracks_without_scene_point,
+        const ImageView<Rgb8>& image,  //
+        const PoseEdge pose_edge,
+        const v2::BrownConradyDistortionModel<double>& camera) -> void;
+
   private: /* data members */
     const CameraPoseGraph& _pose_graph;
     const FeatureGraph& _feature_graph;
diff --git a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
index a814cd946..644bc8228 100644
--- a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
+++ b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
@@ -181,7 +181,7 @@ auto v2::OdometryPipeline::grow_geometry() -> bool
   // TODO: don't add 3D scene points that are too far, like point in the
   // sky
   const auto frame_rgb8 = _distortion_corrector->frame_rgb8();
-  _point_cloud_generator->seed_point_cloud(_tracks_alive, frame_rgb8, pose_edge,
+  _point_cloud_generator->grow_point_cloud(_tracks_alive, frame_rgb8, pose_edge,
                                            _camera);
 
   return true;
@@ -214,5 +214,7 @@ auto v2::OdometryPipeline::grow_geometry() -> bool
   // // 4. Determine the current absolute pose from the alive tracks.
 
   // // TODO: Grow point cloud by triangulation.
+  // _point_cloud_generator->grow_point_cloud(_ftracks_without_scene_point,
+  //                                          frame_rgb8, pose_edge, _camera);
   // return false;
 }

From 471059b635422aeeb4380c50f68d238061dd29fc Mon Sep 17 00:00:00 2001
From: David OK <david.ok8@gmail.com>
Date: Wed, 17 Apr 2024 13:35:22 +0100
Subject: [PATCH 41/49] ENH: inspect the pipeline step by step.

---
 .../visual_odometry_example_v2.cpp            | 53 +++++++++++++++----
 .../Sara/SfM/OdometryV2/OdometryPipeline.cpp  | 16 +++++-
 2 files changed, 59 insertions(+), 10 deletions(-)

diff --git a/cpp/examples/Sara/MultiViewGeometry/visual_odometry_example_v2.cpp b/cpp/examples/Sara/MultiViewGeometry/visual_odometry_example_v2.cpp
index 0c51d4f60..6063c8e39 100644
--- a/cpp/examples/Sara/MultiViewGeometry/visual_odometry_example_v2.cpp
+++ b/cpp/examples/Sara/MultiViewGeometry/visual_odometry_example_v2.cpp
@@ -72,6 +72,7 @@ class SingleWindowApp
     glfwSetWindowUserPointer(_window, this);
     // Register callbacks.
     glfwSetWindowSizeCallback(_window, window_size_callback);
+    glfwSetKeyCallback(_window, key_callback);
   }
 
   ~SingleWindowApp()
@@ -115,12 +116,24 @@ class SingleWindowApp
     glfwSwapInterval(1);
     while (!glfwWindowShouldClose(_window))
     {
-      if (!_pipeline.read())
-        break;
+      if (!_pause)
+      {
+        if (!_pipeline.read())
+          break;
 
-      _pipeline.process();
-      // Load data to OpenGL.
-      upload_point_cloud_data_to_opengl();
+        if (!_pipeline._video_streamer.skip())
+        {
+          _pipeline.process();
+
+          // Load data to OpenGL.
+          //
+          // TODO: upload only if we have a new image frame to process and only
+          // if the absolute pose estimation is successful.
+          upload_point_cloud_data_to_opengl();
+
+          _pause = true;
+        }
+      }
 
       // Clear the color buffer and the buffer testing.
       glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
@@ -172,19 +185,22 @@ class SingleWindowApp
   auto upload_point_cloud_data_to_opengl() -> void
   {
     const auto& point_cloud = _pipeline.point_cloud();
+
+    static constexpr auto dim = 6;
+    const auto num_points = static_cast<int>(point_cloud.size());
+    if (num_points == 0)
+      return;
+
     const auto ptr =
         const_cast<sara::PointCloudGenerator::ScenePoint*>(point_cloud.data());
     const auto ptrd = reinterpret_cast<double*>(ptr);
-
-    const auto num_points = static_cast<int>(point_cloud.size());
-    static constexpr auto dim = 6;
     const auto pc_tview = sara::TensorView_<double, 2>{
         ptrd,              //
         {num_points, dim}  //
     };
 
     auto& logger = sara::Logger::get();
-    SARA_LOGW(logger, "point cloud dimensions: {} ", pc_tview.sizes());
+    SARA_LOGI(logger, "point cloud dimensions: {} ", pc_tview.sizes());
     _point_cloud.upload_host_data_to_gl(pc_tview.cast<float>());
   }
 
@@ -271,6 +287,22 @@ class SingleWindowApp
     self._point_cloud_projection = self._point_cloud_viewport.perspective();
   }
 
+  static auto key_callback(GLFWwindow* window,  //
+                           int key,             //
+                           int /* scancode */,  //
+                           int action,          //
+                           int /* mods */) -> void
+  {
+    auto& app = get_self(window);
+    if (app._pause && key == GLFW_KEY_SPACE &&
+        (action == GLFW_RELEASE || action == GLFW_REPEAT))
+    {
+      app._pause = false;
+      std::cout << "RESUME" << std::endl;
+      return;
+    }
+  }
+
 private:
   static auto init_glfw() -> void
   {
@@ -342,6 +374,9 @@ class SingleWindowApp
   Eigen::Matrix4f _point_cloud_projection;
   // kgl::Camera _point_cloud_camera;
   float _point_size = 5.f;
+
+  //! @brief User interaction.
+  bool _pause = false;
 };
 
 bool SingleWindowApp::_glfw_initialized = false;
diff --git a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
index 644bc8228..9ef89a9b3 100644
--- a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
+++ b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
@@ -52,6 +52,12 @@ auto v2::OdometryPipeline::process() -> void
   if (_video_streamer.skip())
     return;
 
+  auto& logger = Logger::get();
+  SARA_LOGI(logger, "[Video Stream] Processing image frame {}",
+            _video_streamer.frame_number());
+
+  SARA_LOGI(logger, "[Image Distortion] Undistort image frame {}",
+            _video_streamer.frame_number());
   _distortion_corrector->undistort();
 
   grow_geometry();
@@ -68,7 +74,7 @@ auto v2::OdometryPipeline::detect_keypoints(const ImageView<float>& image) const
   auto& logger = Logger::get();
   const auto keys = compute_sift_keypoints(image,  //
                                            _feature_params.image_pyr_params);
-  SARA_LOGI(logger, "[Feature Detection] {} keypoints", features(keys).size());
+  SARA_LOGI(logger, "[Keypoint Detection] {} keypoints", features(keys).size());
   return keys;
 }
 
@@ -124,6 +130,10 @@ auto v2::OdometryPipeline::grow_geometry() -> bool
   const auto frame_number = _video_streamer.frame_number();
   auto keys_curr = detect_keypoints(frame_gray32f);
 
+  // TODO: CHECK EVERYTHING UNTIL HERE.
+  return true;
+
+
   // Boundary case: the graphs are empty.
   if (_pose_graph.num_vertices() == 0)
   {
@@ -176,10 +186,14 @@ auto v2::OdometryPipeline::grow_geometry() -> bool
   std::tie(_tracks_alive, _track_visibility_count) =
       _feature_tracker.calculate_alive_feature_tracks(_pose_curr);
 
+
   // 4. Initialize the point cloud.
   //
   // TODO: don't add 3D scene points that are too far, like point in the
   // sky
+  //
+  // TODO: don't clear next time we just need to debug at this time.
+  _point_cloud.clear();
   const auto frame_rgb8 = _distortion_corrector->frame_rgb8();
   _point_cloud_generator->grow_point_cloud(_tracks_alive, frame_rgb8, pose_edge,
                                            _camera);

From 1fc07d48c8922dd8af547a313e27f9171239ccd0 Mon Sep 17 00:00:00 2001
From: David OK <david.ok8@gmail.com>
Date: Wed, 17 Apr 2024 14:05:40 +0100
Subject: [PATCH 42/49] MAINT: fix bug.

---
 .../BuildingBlocks/PointCloudGenerator.cpp    |  4 +-
 cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.cpp |  7 ++--
 .../Sara/SfM/OdometryV2/OdometryPipeline.cpp  | 40 ++++++++++++++-----
 3 files changed, 37 insertions(+), 14 deletions(-)

diff --git a/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.cpp b/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.cpp
index 1dfaf5054..6ef5e789c 100644
--- a/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.cpp
+++ b/cpp/src/DO/Sara/SfM/BuildingBlocks/PointCloudGenerator.cpp
@@ -360,7 +360,7 @@ auto PointCloudGenerator::grow_point_cloud(
       rays_u, rays_v);
 
   SARA_LOGD(logger, "Adding new scene points to point cloud...");
-  SARA_LOGD(logger, "[BEFORE] point cloud: {} 3D points", _point_cloud.size());
+  SARA_LOGD(logger, "[BEFORE] {} scene points", _point_cloud.size());
 
   // N.B.: start with the right offset for the scene point index.
   auto scene_point_index = _point_cloud.size();
@@ -390,5 +390,5 @@ auto PointCloudGenerator::grow_point_cloud(
     ++scene_point_index;
   }
 
-  SARA_LOGD(logger, "[AFTER ] point cloud: {} 3D points", _point_cloud.size());
+  SARA_LOGD(logger, "[AFTER ] {} scene points", _point_cloud.size());
 }
diff --git a/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.cpp b/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.cpp
index 37b3b784f..cfd2cf8ab 100644
--- a/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.cpp
+++ b/cpp/src/DO/Sara/SfM/Graph/CameraPoseGraph.cpp
@@ -32,9 +32,10 @@ auto CameraPoseGraph::add_absolute_pose(AbsolutePoseData&& data)
   _g[v] = std::move(data);
 
   SARA_LOGI(logger,
-            "[SfM] Added camera absolute pose[frame:{}]:\n"
-            "Keypoints: {} points\n"
-            "Absolute pose: {}\n",             //
+            "[SfM] Added camera absolute pose:\n"
+            "- Frame        : {}\n"
+            "- Keypoints    : {}\n"
+            "- Absolute pose:\n{}\n",          //
             _g[v].image_id,                    //
             features(_g[v].keypoints).size(),  //
             _g[v].pose.matrix34());
diff --git a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
index 9ef89a9b3..6bfec0ae9 100644
--- a/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
+++ b/cpp/src/DO/Sara/SfM/OdometryV2/OdometryPipeline.cpp
@@ -13,11 +13,11 @@
 
 #include <DO/Sara/Logging/Logger.hpp>
 
-#include <DO/Sara/Graphics/ImageDraw.hpp>
-#include <DO/Sara/Visualization/Features/Draw.hpp>
-
+#include <DO/Sara/Core/Math/Rotation.hpp>
 #include <DO/Sara/FeatureDetectors/SIFT.hpp>
+#include <DO/Sara/Graphics/ImageDraw.hpp>
 #include <DO/Sara/SfM/Helpers/KeypointMatching.hpp>
+#include <DO/Sara/Visualization/Features/Draw.hpp>
 
 
 using namespace DO::Sara;
@@ -130,10 +130,6 @@ auto v2::OdometryPipeline::grow_geometry() -> bool
   const auto frame_number = _video_streamer.frame_number();
   auto keys_curr = detect_keypoints(frame_gray32f);
 
-  // TODO: CHECK EVERYTHING UNTIL HERE.
-  return true;
-
-
   // Boundary case: the graphs are empty.
   if (_pose_graph.num_vertices() == 0)
   {
@@ -161,6 +157,7 @@ auto v2::OdometryPipeline::grow_geometry() -> bool
   }
   SARA_LOGI(logger, "[SfM] Relative pose succeeded!");
 
+
   // if (_pose_graph.num_vertices() == 1)
   // {
   auto abs_pose_curr = QuaternionBasedPose<double>{
@@ -168,6 +165,32 @@ auto v2::OdometryPipeline::grow_geometry() -> bool
       .t = rel_pose_data.motion.t  //
   };
 
+  // The rotation is expressed in the camera coordinates.
+  // But the calculation is done in the automotive/aeronautics coordinate
+  // system.
+  //
+  // The z-coordinate of the camera coordinates is the x-axis of the automotive
+  // coordinates
+  //
+  // clang-format off
+  static const auto P = (Eigen::Matrix3d{} <<
+     0,  0, 1,
+    -1,  0, 0,
+     0, -1, 0
+  ).finished();
+  // clang-format on
+
+  const auto& R = rel_pose_data.motion.R;
+  const Eigen::Matrix3d R_delta_abs = P * R.transpose() * P.transpose();
+  _current_global_rotation = R_delta_abs * _current_global_rotation;
+
+  const auto q_global = Eigen::Quaterniond{_current_global_rotation};
+  auto angles = calculate_yaw_pitch_roll(q_global);
+  static constexpr auto degrees = 180. / M_PI;
+  SARA_LOGI(logger, "Global yaw   = {} deg", angles(0) * degrees);
+  SARA_LOGI(logger, "Global pitch = {} deg", angles(1) * degrees);
+  SARA_LOGI(logger, "Global roll  = {} deg", angles(2) * degrees);
+
   auto abs_pose_data = AbsolutePoseData{
       frame_number,             //
       std::move(keys_curr),     //
@@ -175,7 +198,7 @@ auto v2::OdometryPipeline::grow_geometry() -> bool
   };
 
   // 1. Add the absolute pose vertex.
-  _pose_graph.add_absolute_pose(std::move(abs_pose_data));
+  _pose_curr = _pose_graph.add_absolute_pose(std::move(abs_pose_data));
 
   // 2. Add the pose edge, which will invalidate the relative pose data.
   const auto pose_edge = _pose_graph.add_relative_pose(
@@ -186,7 +209,6 @@ auto v2::OdometryPipeline::grow_geometry() -> bool
   std::tie(_tracks_alive, _track_visibility_count) =
       _feature_tracker.calculate_alive_feature_tracks(_pose_curr);
 
-
   // 4. Initialize the point cloud.
   //
   // TODO: don't add 3D scene points that are too far, like point in the

From acc11667027e6ab5cd5e6d70bc21247fc04a2ff2 Mon Sep 17 00:00:00 2001
From: David OK <david.ok8@gmail.com>
Date: Wed, 17 Apr 2024 15:57:05 +0100
Subject: [PATCH 43/49] MAINT: copyright notice, less includes.

---
 .../MinimalSolvers/SevenPointAlgorithm.hpp            |  2 --
 .../MultiViewGeometry/PointCorrespondenceList.hpp     | 11 +++++++++++
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/cpp/src/DO/Sara/MultiViewGeometry/MinimalSolvers/SevenPointAlgorithm.hpp b/cpp/src/DO/Sara/MultiViewGeometry/MinimalSolvers/SevenPointAlgorithm.hpp
index ec425d85f..4905aeb9f 100644
--- a/cpp/src/DO/Sara/MultiViewGeometry/MinimalSolvers/SevenPointAlgorithm.hpp
+++ b/cpp/src/DO/Sara/MultiViewGeometry/MinimalSolvers/SevenPointAlgorithm.hpp
@@ -6,8 +6,6 @@
 #include <DO/Sara/Core/Tensor.hpp>
 #include <DO/Sara/MultiViewGeometry/Geometry/FundamentalMatrix.hpp>
 
-#include <optional>
-
 
 namespace DO::Sara {
 
diff --git a/cpp/src/DO/Sara/MultiViewGeometry/PointCorrespondenceList.hpp b/cpp/src/DO/Sara/MultiViewGeometry/PointCorrespondenceList.hpp
index 49d0b2dda..17b01e650 100644
--- a/cpp/src/DO/Sara/MultiViewGeometry/PointCorrespondenceList.hpp
+++ b/cpp/src/DO/Sara/MultiViewGeometry/PointCorrespondenceList.hpp
@@ -1,3 +1,14 @@
+// ========================================================================== //
+// This file is part of Sara, a basic set of libraries in C++ for computer
+// vision.
+//
+// Copyright (C) 2024 David Ok <david.ok8@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License v. 2.0. If a copy of the MPL was not distributed with this file,
+// you can obtain one at http://mozilla.org/MPL/2.0/.
+// ========================================================================== //
+
 #pragma once
 
 #include <DO/Sara/Core/Tensor.hpp>

From c12a2355f4d617bee1c1665b2073f662a74b288b Mon Sep 17 00:00:00 2001
From: David OK <david.ok8@gmail.com>
Date: Wed, 17 Apr 2024 16:14:37 +0100
Subject: [PATCH 44/49] WIP: save work.

---
 .../MinimalSolvers/P3PSolver.hpp              | 96 +++++++++++++++++++
 .../PointRayCorrespondenceList.hpp            | 26 +++++
 2 files changed, 122 insertions(+)
 create mode 100644 cpp/src/DO/Sara/MultiViewGeometry/MinimalSolvers/P3PSolver.hpp
 create mode 100644 cpp/src/DO/Sara/MultiViewGeometry/PointRayCorrespondenceList.hpp

diff --git a/cpp/src/DO/Sara/MultiViewGeometry/MinimalSolvers/P3PSolver.hpp b/cpp/src/DO/Sara/MultiViewGeometry/MinimalSolvers/P3PSolver.hpp
new file mode 100644
index 000000000..8166f3c0c
--- /dev/null
+++ b/cpp/src/DO/Sara/MultiViewGeometry/MinimalSolvers/P3PSolver.hpp
@@ -0,0 +1,96 @@
+// ========================================================================== //
+// This file is part of Sara, a basic set of libraries in C++ for computer
+// vision.
+//
+// Copyright (C) 2024 David Ok <david.ok8@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License v. 2.0. If a copy of the MPL was not distributed with this file,
+// you can obtain one at http://mozilla.org/MPL/2.0/.
+// ========================================================================== //
+
+#pragma once
+
+#include "DO/Sara/Core/Math/UsualFunctions.hpp"
+#include <DO/Sara/MultiViewGeometry/PnP/LambdaTwist.hpp>
+#include <DO/Sara/MultiViewGeometry/PointRayCorrespondenceList.hpp>
+
+
+namespace DO::Sara {
+
+  template <typename T>
+  struct P3PSolver
+  {
+    static constexpr auto num_points = 3;
+    static constexpr auto num_models = 4;
+
+    using data_point_type = TensorView_<T, 2>;
+    using model_type = Eigen::Matrix<T, 3, 4>;
+
+    inline auto operator()(const data_point_type& x) const
+        -> std::vector<model_type>
+    {
+      const Eigen::Matrix3<T> scene_points = x.matrix().leftCols(3);
+      const Eigen::Matrix3<T> backprojected_rays = x.matrix().rightCols(3);
+      return solve_p3p(scene_points, backprojected_rays);
+    }
+  };
+
+  //! @brief Joint cheirality and reprojection consistency for PnP with RANSAC.
+  template <typename CameraModel>
+  struct CheiralPnPConsistency
+  {
+    using Model = Eigen::Matrix<double, 3, 4>;
+
+    CameraModel camera;
+    Model pose;
+    double pixel_reprojection_error;
+
+    CheiralPnPConsistency() = default;
+
+    CheiralPnPConsistency(const Model& pose)
+    {
+      set_model(pose);
+    }
+
+    auto set_model(const Model& p) -> void
+    {
+      pose = p;
+    }
+
+    template <typename Derived>
+    auto operator()(const Eigen::MatrixBase<Derived>& scene_points,
+                    const Eigen::MatrixBase<Derived>& rays) const
+        -> Eigen::Array<bool, 1, Eigen::Dynamic>
+    {
+      Eigen::MatrixXd u1n = pose * scene_points;
+      Eigen::MatrixXd u2{rays.rows(), rays.cols()};
+
+      auto u1 = Eigen::MatrixXd{2, scene_points.cols()};
+      for (auto i = 0; i < u1.cols(); ++i)
+        u1.col(i) = camera.project(u1.col(i));
+
+      for (auto i = 0; i < u2.cols(); ++i)
+        u2.col(i) = camera.project(rays.col(i));
+
+      const auto err_max = square(pixel_reprojection_error);
+
+      const auto small_reproj_error =
+          (u2 - u1).colwise().squaredNorm().array() < err_max;
+      const auto cheiral = scene_points.row(2).array() > 0;
+
+      return small_reproj_error && cheiral;
+    }
+
+    //! @brief Check the inlier predicate on a list of correspondences.
+    template <typename T>
+    inline auto operator()(const PointRayCorrespondenceSubsetList<T>& m) const
+        -> Array<bool, 1, Dynamic>
+    {
+      const auto& scene_points = m._p1.colmajor_view().matrix();
+      const auto& backprojected_rays = m._p2.colmajor_view().matrix();
+      return this->operator()(scene_points, backprojected_rays);
+    }
+  };
+
+}  // namespace DO::Sara
diff --git a/cpp/src/DO/Sara/MultiViewGeometry/PointRayCorrespondenceList.hpp b/cpp/src/DO/Sara/MultiViewGeometry/PointRayCorrespondenceList.hpp
new file mode 100644
index 000000000..e507a49f4
--- /dev/null
+++ b/cpp/src/DO/Sara/MultiViewGeometry/PointRayCorrespondenceList.hpp
@@ -0,0 +1,26 @@
+// ========================================================================== //
+// This file is part of Sara, a basic set of libraries in C++ for computer
+// vision.
+//
+// Copyright (C) 2024 David Ok <david.ok8@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License v. 2.0. If a copy of the MPL was not distributed with this file,
+// you can obtain one at http://mozilla.org/MPL/2.0/.
+// ========================================================================== //
+
+#pragma once
+
+#include "DO/Sara/MultiViewGeometry/PointCorrespondenceList.hpp"
+#include <DO/Sara/Core/Tensor.hpp>
+
+
+namespace DO::Sara {
+
+  template <typename T>
+  using PointRayCorrespondenceList = PointCorrespondenceList<T>;
+
+  template <typename T>
+  using PointRayCorrespondenceSubsetList = PointCorrespondenceSubsetList<T>;
+
+}  // namespace DO::Sara

From 6bd8750d41413225b03827885b8e6b799bf39b8f Mon Sep 17 00:00:00 2001
From: David OK <david.ok8@gmail.com>
Date: Wed, 17 Apr 2024 17:37:49 +0100
Subject: [PATCH 45/49] WIP: save work.

---
 .../MinimalSolvers/P3PSolver.hpp              | 21 ++++++++++++-------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/cpp/src/DO/Sara/MultiViewGeometry/MinimalSolvers/P3PSolver.hpp b/cpp/src/DO/Sara/MultiViewGeometry/MinimalSolvers/P3PSolver.hpp
index 8166f3c0c..77541f522 100644
--- a/cpp/src/DO/Sara/MultiViewGeometry/MinimalSolvers/P3PSolver.hpp
+++ b/cpp/src/DO/Sara/MultiViewGeometry/MinimalSolvers/P3PSolver.hpp
@@ -42,9 +42,9 @@ namespace DO::Sara {
   {
     using Model = Eigen::Matrix<double, 3, 4>;
 
-    CameraModel camera;
+    const CameraModel* camera = nullptr;
     Model pose;
-    double pixel_reprojection_error;
+    double image_reproj_err_max;
 
     CheiralPnPConsistency() = default;
 
@@ -63,21 +63,26 @@ namespace DO::Sara {
                     const Eigen::MatrixBase<Derived>& rays) const
         -> Eigen::Array<bool, 1, Eigen::Dynamic>
     {
-      Eigen::MatrixXd u1n = pose * scene_points;
-      Eigen::MatrixXd u2{rays.rows(), rays.cols()};
+      if (camera == nullptr)
+        throw std::runtime_error{
+            "Error: you must initialize the intrinsic camera parameters!"};
 
+      Eigen::MatrixXd u1n = pose * scene_points;
       auto u1 = Eigen::MatrixXd{2, scene_points.cols()};
       for (auto i = 0; i < u1.cols(); ++i)
-        u1.col(i) = camera.project(u1.col(i));
+        u1.col(i) = camera->project(u1n.col(i));
 
+      Eigen::MatrixXd u2{2, rays.cols()};
       for (auto i = 0; i < u2.cols(); ++i)
-        u2.col(i) = camera.project(rays.col(i));
+        u2.col(i) = camera->project(rays.col(i));
 
-      const auto err_max = square(pixel_reprojection_error);
+      // Check the cheirality w.r.t. the candidate pose.
+      const auto cheiral = u1n.row(2).array() > 0;
 
+      // Check the image reprojection errors.
+      const auto err_max = square(image_reproj_err_max);
       const auto small_reproj_error =
           (u2 - u1).colwise().squaredNorm().array() < err_max;
-      const auto cheiral = scene_points.row(2).array() > 0;
 
       return small_reproj_error && cheiral;
     }

From aeabbc0041405ff78a6aa252f8af51f5c2325acc Mon Sep 17 00:00:00 2001
From: David OK <david.ok8@gmail.com>
Date: Thu, 18 Apr 2024 13:19:43 +0100
Subject: [PATCH 46/49] MAINT: rename variables for more natural usage.

---
 .../FeatureDetectors/EdgePostProcessing.hpp   |  4 +-
 .../Algorithms/RobustEstimation/PointList.hpp | 10 ++--
 cpp/src/DO/Sara/Geometry/Tools/Normalizer.hpp |  2 +-
 .../MultiViewGeometry/Geometry/Normalizer.hpp | 42 +++++++-------
 .../MinimalSolvers/InlierPredicates.hpp       |  7 ++-
 .../PointCorrespondenceList.hpp               | 57 ++++++++++++-------
 .../PointRayCorrespondenceList.hpp            |  3 +
 cpp/src/DO/Sara/RANSAC/RANSAC.hpp             |  6 +-
 cpp/src/DO/Sara/RANSAC/Utility.hpp            |  6 +-
 ...test_multiviewgeometry_vanishing_point.cpp |  2 +-
 10 files changed, 81 insertions(+), 58 deletions(-)

diff --git a/cpp/src/DO/Sara/FeatureDetectors/EdgePostProcessing.hpp b/cpp/src/DO/Sara/FeatureDetectors/EdgePostProcessing.hpp
index feb02db32..8926c2969 100644
--- a/cpp/src/DO/Sara/FeatureDetectors/EdgePostProcessing.hpp
+++ b/cpp/src/DO/Sara/FeatureDetectors/EdgePostProcessing.hpp
@@ -194,8 +194,8 @@ namespace DO::Sara {
 
     const auto num_curve_points = static_cast<int>(curve_points.size());
     auto point_list = PointList<float, 2>{};
-    point_list._data.resize(num_curve_points, 3);
-    auto& points = point_list._data;
+    point_list.data.resize(num_curve_points, 3);
+    auto& points = point_list.data;
     auto point_matrix = points.matrix();
     for (auto r = 0; r < num_curve_points; ++r)
       point_matrix.row(r) = curve_points[r]      //
diff --git a/cpp/src/DO/Sara/Geometry/Algorithms/RobustEstimation/PointList.hpp b/cpp/src/DO/Sara/Geometry/Algorithms/RobustEstimation/PointList.hpp
index 8204f7f41..1f69d5cdf 100644
--- a/cpp/src/DO/Sara/Geometry/Algorithms/RobustEstimation/PointList.hpp
+++ b/cpp/src/DO/Sara/Geometry/Algorithms/RobustEstimation/PointList.hpp
@@ -23,25 +23,25 @@ namespace DO::Sara {
 
     operator TensorView<T, D>&()
     {
-      return _data;
+      return data;
     }
 
     operator const TensorView<T, D>&() const
     {
-      return _data;
+      return data;
     }
 
     auto operator[](const int n) const -> value_type
     {
-      return _data[n];
+      return data[n];
     }
 
     auto size() const -> int
     {
-      return _data.size(0);
+      return data.size(0);
     }
 
-    Tensor_<T, D> _data;
+    Tensor_<T, D> data;
   };
 
 }  // namespace DO::Sara
diff --git a/cpp/src/DO/Sara/Geometry/Tools/Normalizer.hpp b/cpp/src/DO/Sara/Geometry/Tools/Normalizer.hpp
index 3952445a9..080118fff 100644
--- a/cpp/src/DO/Sara/Geometry/Tools/Normalizer.hpp
+++ b/cpp/src/DO/Sara/Geometry/Tools/Normalizer.hpp
@@ -96,7 +96,7 @@ namespace DO::Sara {
     inline auto normalize(const PointList<S, 2>& X) const
     {
       auto Xn = PointList<S, 2>{};
-      Xn._data = apply_transform(T, X._data);
+      Xn.data = apply_transform(T, X.data);
       return Xn;
     }
 
diff --git a/cpp/src/DO/Sara/MultiViewGeometry/Geometry/Normalizer.hpp b/cpp/src/DO/Sara/MultiViewGeometry/Geometry/Normalizer.hpp
index fb32a281a..3e44cafff 100644
--- a/cpp/src/DO/Sara/MultiViewGeometry/Geometry/Normalizer.hpp
+++ b/cpp/src/DO/Sara/MultiViewGeometry/Geometry/Normalizer.hpp
@@ -38,8 +38,8 @@ namespace DO::Sara {
     }
 
     Normalizer(const PointCorrespondenceList<double>& matches)
-      : T1{compute_normalizer(matches._p1)}
-      , T2{compute_normalizer(matches._p2)}
+      : T1{compute_normalizer(matches.x)}
+      , T2{compute_normalizer(matches.y)}
     {
       T1_inv = T1.inverse();
       T2_inv = T2.inverse();
@@ -51,12 +51,12 @@ namespace DO::Sara {
       return std::make_tuple(apply_transform(T1, p1), apply_transform(T2, p2));
     }
 
-    auto normalize(const PointCorrespondenceList<double>& X) const
+    auto normalize(const PointCorrespondenceList<double>& M) const
         -> PointCorrespondenceList<double>
     {
-      auto Xn = PointCorrespondenceList<double>{};
-      std::tie(Xn._p1, Xn._p2) = this->normalize(X._p1, X._p2);
-      return Xn;
+      auto Mn = PointCorrespondenceList<double>{};
+      std::tie(Mn.x, Mn.y) = this->normalize(M.x, M.y);
+      return Mn;
     }
 
     inline auto denormalize(Eigen::Matrix3d& H) const -> void
@@ -81,9 +81,9 @@ namespace DO::Sara {
     {
     }
 
-    Normalizer(const PointCorrespondenceList<double>& matches)
-      : T1{compute_normalizer(matches._p1)}
-      , T2{compute_normalizer(matches._p2)}
+    Normalizer(const PointCorrespondenceList<double>& M)
+      : T1{compute_normalizer(M.x)}
+      , T2{compute_normalizer(M.y)}
     {
     }
 
@@ -93,12 +93,12 @@ namespace DO::Sara {
       return std::make_tuple(apply_transform(T1, p1), apply_transform(T2, p2));
     }
 
-    auto normalize(const PointCorrespondenceList<double>& X) const
+    auto normalize(const PointCorrespondenceList<double>& M) const
         -> PointCorrespondenceList<double>
     {
-      auto Xn = PointCorrespondenceList<double>{};
-      std::tie(Xn._p1, Xn._p2) = this->normalize(X._p1, X._p2);
-      return Xn;
+      auto Mn = PointCorrespondenceList<double>{};
+      std::tie(Mn.x, Mn.y) = this->normalize(M.x, M.y);
+      return Mn;
     }
 
     auto denormalize(Eigen::Matrix3d& F) const -> void
@@ -133,12 +133,12 @@ namespace DO::Sara {
       return std::make_tuple(p1n, p2n);
     }
 
-    auto normalize(const PointCorrespondenceList<double>& X) const
+    auto normalize(const PointCorrespondenceList<double>& M) const
         -> PointCorrespondenceList<double>
     {
-      auto Xn = PointCorrespondenceList<double>{};
-      std::tie(Xn._p1, Xn._p2) = this->normalize(X._p1, X._p2);
-      return Xn;
+      auto Mn = PointCorrespondenceList<double>{};
+      std::tie(Mn.x, Mn.y) = this->normalize(M.x, M.y);
+      return Mn;
     }
 
     //! @brief Dummy implementation.
@@ -172,12 +172,12 @@ namespace DO::Sara {
       return std::make_tuple(p1n, p2n);
     }
 
-    auto normalize(const PointCorrespondenceList<double>& X) const
+    auto normalize(const PointCorrespondenceList<double>& M) const
         -> PointCorrespondenceList<double>
     {
-      auto Xn = PointCorrespondenceList<double>{};
-      std::tie(Xn._p1, Xn._p2) = this->normalize(X._p1, X._p2);
-      return Xn;
+      auto Mn = PointCorrespondenceList<double>{};
+      std::tie(Mn.x, Mn.y) = this->normalize(M.x, M.y);
+      return Mn;
     }
 
     //! @brief Dummy implementation.
diff --git a/cpp/src/DO/Sara/MultiViewGeometry/MinimalSolvers/InlierPredicates.hpp b/cpp/src/DO/Sara/MultiViewGeometry/MinimalSolvers/InlierPredicates.hpp
index 3cd5a4c03..8451ccdb4 100644
--- a/cpp/src/DO/Sara/MultiViewGeometry/MinimalSolvers/InlierPredicates.hpp
+++ b/cpp/src/DO/Sara/MultiViewGeometry/MinimalSolvers/InlierPredicates.hpp
@@ -43,7 +43,8 @@ namespace DO::Sara {
       // THE BUG IS HERE: we need to pass backprojected rays and instead we are
       // giving pixel coordinates.
       const auto [X, s1, s2] = triangulate_linear_eigen(P1, P2, u1, u2);
-      const auto cheirality = (s1.transpose().array()) > 0 && (s2.transpose().array() > 0);
+      const auto cheirality =
+          (s1.transpose().array()) > 0 && (s2.transpose().array() > 0);
 
       return epipolar_consistent && cheirality;
 #else
@@ -57,8 +58,8 @@ namespace DO::Sara {
     inline auto operator()(const PointCorrespondenceList<T>& m) const
         -> Array<bool, 1, Dynamic>
     {
-      return this->operator()(m._p1.colmajor_view().matrix(),
-                              m._p2.colmajor_view().matrix());
+      return this->operator()(m.x.colmajor_view().matrix(),
+                              m.y.colmajor_view().matrix());
     }
   };
 
diff --git a/cpp/src/DO/Sara/MultiViewGeometry/PointCorrespondenceList.hpp b/cpp/src/DO/Sara/MultiViewGeometry/PointCorrespondenceList.hpp
index 17b01e650..08a5e8864 100644
--- a/cpp/src/DO/Sara/MultiViewGeometry/PointCorrespondenceList.hpp
+++ b/cpp/src/DO/Sara/MultiViewGeometry/PointCorrespondenceList.hpp
@@ -16,6 +16,22 @@
 
 namespace DO::Sara {
 
+  //! @brief The data structure to store the list of point correspondences.
+  //!
+  //! A point correspondence is denoted as $(x[i], y[i])$.
+  //!
+  //! In my opinion, 'x' and 'y' are the lightest and most agnostic notations I
+  //! could find.
+  //! We do lose expressivity because they may be too neutral, but it still
+  //! feels natural and mathematical at the same time.
+  //!
+  //! Notice that we are not assuming anything about the dimensions of x[i]
+  //! and y[i].
+  //! - x[i] and y[i] don't have to have the same dimensions.
+  //! - (x[i], y[i]) can be a 2D point <-> 2D point correspondence in two
+  //!   different images, or
+  //! - (x[i], y[i]) can be a 3D point <-> 2D point correspondence, or
+  //! - (x[i], y[i]) can be a 3D scene point <-> 3D ray correspondence.
   template <typename T>
   struct PointCorrespondenceList
   {
@@ -24,37 +40,40 @@ namespace DO::Sara {
     PointCorrespondenceList() = default;
 
     PointCorrespondenceList(const TensorView_<int, 2>& M,
-                            const TensorView_<T, 2>& p1,
-                            const TensorView_<T, 2>& p2)
-      : _p1{M.size(0), p1.size(1)}
-      , _p2{M.size(0), p2.size(1)}
+                            const TensorView_<T, 2>& x_all,
+                            const TensorView_<T, 2>& y_all)
+      : x{M.size(0), x_all.size(1)}
+      , y{M.size(0), y_all.size(1)}
     {
-      auto p1_mat = p1.matrix();
-      auto p2_mat = p2.matrix();
-      auto p1_matched = _p1.matrix();
-      auto p2_matched = _p2.matrix();
+      auto x_all_mat = x_all.matrix();
+      auto y_all_mat = y_all.matrix();
+      auto x_matched = x.matrix();
+      auto y_matched = y.matrix();
       for (auto m = 0; m < M.size(0); ++m)
       {
-        const auto& i1 = M(m, 0);
-        const auto& i2 = M(m, 1);
+        const auto& x_idx = M(m, 0);
+        const auto& y_idx = M(m, 1);
 
-        p1_matched.row(m) = p1_mat.row(i1);
-        p2_matched.row(m) = p2_mat.row(i2);
+        x_matched.row(m) = x_all_mat.row(x_idx);
+        y_matched.row(m) = y_all_mat.row(y_idx);
       }
     }
 
     auto size() const -> int
     {
-      return _p1.size(0);
+      return x.size(0);
     }
 
     auto operator[](const int n) const -> value_type
     {
-      return {_p1[n], _p2[n]};
+      return {x[n], y[n]};
     }
 
-    Tensor_<T, 2> _p1;
-    Tensor_<T, 2> _p2;
+    //! @brief The correspondences are: (x[i], y[i]).
+    //! @{
+    Tensor_<T, 2> x;
+    Tensor_<T, 2> y;
+    //! @}
   };
 
   template <typename T>
@@ -64,11 +83,11 @@ namespace DO::Sara {
 
     auto operator[](const int n) const -> value_type
     {
-      return {_p1[n], _p2[n]};
+      return {x[n], y[n]};
     }
 
-    Tensor_<T, 3> _p1;
-    Tensor_<T, 3> _p2;
+    Tensor_<T, 3> x;
+    Tensor_<T, 3> y;
   };
 
 }  // namespace DO::Sara
diff --git a/cpp/src/DO/Sara/MultiViewGeometry/PointRayCorrespondenceList.hpp b/cpp/src/DO/Sara/MultiViewGeometry/PointRayCorrespondenceList.hpp
index e507a49f4..e166d7a02 100644
--- a/cpp/src/DO/Sara/MultiViewGeometry/PointRayCorrespondenceList.hpp
+++ b/cpp/src/DO/Sara/MultiViewGeometry/PointRayCorrespondenceList.hpp
@@ -17,10 +17,13 @@
 
 namespace DO::Sara {
 
+  //! @brief Convenient aliases.
+  //! @{
   template <typename T>
   using PointRayCorrespondenceList = PointCorrespondenceList<T>;
 
   template <typename T>
   using PointRayCorrespondenceSubsetList = PointCorrespondenceSubsetList<T>;
+  //! @}
 
 }  // namespace DO::Sara
diff --git a/cpp/src/DO/Sara/RANSAC/RANSAC.hpp b/cpp/src/DO/Sara/RANSAC/RANSAC.hpp
index 986dfd263..124af43a3 100644
--- a/cpp/src/DO/Sara/RANSAC/RANSAC.hpp
+++ b/cpp/src/DO/Sara/RANSAC/RANSAC.hpp
@@ -151,7 +151,7 @@ namespace DO::Sara {
     inline auto operator()(const PointList<T, D>& X) const
         -> Array<bool, 1, Dynamic>
     {
-      return distance(X._data.colmajor_view().matrix()).array() <
+      return distance(X.data.colmajor_view().matrix()).array() <
              static_cast<float>(err_threshold);
     }
 
@@ -160,8 +160,8 @@ namespace DO::Sara {
     inline auto operator()(const PointCorrespondenceList<T>& m) const
         -> Array<bool, 1, Dynamic>
     {
-      return distance(m._p1.colmajor_view().matrix(),
-                      m._p2.colmajor_view().matrix())
+      return distance(m.x.colmajor_view().matrix(),
+                      m.y.colmajor_view().matrix())
                  .array() < err_threshold;
     }
   };
diff --git a/cpp/src/DO/Sara/RANSAC/Utility.hpp b/cpp/src/DO/Sara/RANSAC/Utility.hpp
index 18e531e60..55a179fb7 100644
--- a/cpp/src/DO/Sara/RANSAC/Utility.hpp
+++ b/cpp/src/DO/Sara/RANSAC/Utility.hpp
@@ -62,7 +62,7 @@ namespace DO::Sara {
   auto from_index_to_point(const TensorView_<int, 2>& point_indices,
                            const PointList<T, D>& points) -> Tensor_<T, D + 1>
   {
-    return from_index_to_point(point_indices, points._data);
+    return from_index_to_point(point_indices, points.data);
   };
 
   template <typename T>
@@ -71,8 +71,8 @@ namespace DO::Sara {
       -> PointCorrespondenceSubsetList<T>
   {
     auto res = PointCorrespondenceSubsetList<T>{};
-    res._p1 = from_index_to_point(point_indices, correspondences._p1);
-    res._p2 = from_index_to_point(point_indices, correspondences._p2);
+    res.x = from_index_to_point(point_indices, correspondences.x);
+    res.y = from_index_to_point(point_indices, correspondences.y);
     return res;
   };
 
diff --git a/cpp/test/Sara/MultiViewGeometry/test_multiviewgeometry_vanishing_point.cpp b/cpp/test/Sara/MultiViewGeometry/test_multiviewgeometry_vanishing_point.cpp
index 91cdca9b3..6c4c7bb1c 100644
--- a/cpp/test/Sara/MultiViewGeometry/test_multiviewgeometry_vanishing_point.cpp
+++ b/cpp/test/Sara/MultiViewGeometry/test_multiviewgeometry_vanishing_point.cpp
@@ -29,7 +29,7 @@ BOOST_AUTO_TEST_CASE(test_vp_detection)
   const auto vp = Eigen::Vector3f(500, 500, 1);
 
   auto line_list = PointList<float, 2>{};
-  auto& lines = line_list._data;
+  auto& lines = line_list.data;
 
   lines = Tensor_<float, 2>{6, 3};
   auto lines_as_matrix = lines.matrix();

From d7143d86ffce687c4209efd65e2c487806301e59 Mon Sep 17 00:00:00 2001
From: David OK <david.ok8@gmail.com>
Date: Thu, 18 Apr 2024 13:22:58 +0100
Subject: [PATCH 47/49] MAINT: fix compile errors.

---
 .../MultiViewGeometry/orthogonal_vanishing_point_detection.cpp  | 2 +-
 cpp/test/Sara/RANSAC/test_ransac_line_fit.cpp                   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/examples/Sara/MultiViewGeometry/orthogonal_vanishing_point_detection.cpp b/cpp/examples/Sara/MultiViewGeometry/orthogonal_vanishing_point_detection.cpp
index 5e4d49d09..2fa26e34e 100644
--- a/cpp/examples/Sara/MultiViewGeometry/orthogonal_vanishing_point_detection.cpp
+++ b/cpp/examples/Sara/MultiViewGeometry/orthogonal_vanishing_point_detection.cpp
@@ -354,7 +354,7 @@ int sara_graphics_main(int argc, char** argv)
     tic();
     const Eigen::MatrixXf lines_as_matrix = lines.matrix().transpose();
     auto plane_list = PointList<float, 2>{};
-    auto& plane_tensor = plane_list._data;
+    auto& plane_tensor = plane_list.data;
     plane_tensor.resize(static_cast<int>(line_segments.size()), 4);
     plane_tensor.colmajor_view().matrix() = (Pt * lines_as_matrix)  //
                                                 .colwise()          //
diff --git a/cpp/test/Sara/RANSAC/test_ransac_line_fit.cpp b/cpp/test/Sara/RANSAC/test_ransac_line_fit.cpp
index 2781ceafc..617e7d77e 100644
--- a/cpp/test/Sara/RANSAC/test_ransac_line_fit.cpp
+++ b/cpp/test/Sara/RANSAC/test_ransac_line_fit.cpp
@@ -27,7 +27,7 @@ BOOST_AUTO_TEST_CASE(test_robust_line_fit)
 {
   auto points = PointList<double, 2>{Tensor_<double, 2>{6, 3}};
   // clang-format off
-  points._data.matrix() <<              0.00,                      0.00, 1,
+  points.data.matrix() <<               0.00,                      0.00, 1,
                                         1.00,                      1.10, 1,
                                         3.40,                      3.46, 1,
                                         9.80,                     10.10, 1,

From 2a7ca1e63dc46d03782b3e1179a87659aae77f47 Mon Sep 17 00:00:00 2001
From: David OK <david.ok8@gmail.com>
Date: Mon, 22 Apr 2024 12:51:08 +0100
Subject: [PATCH 48/49] MAINT: review code.

---
 .../MinimalSolvers/P3PSolver.hpp              | 45 ++++++++++---------
 1 file changed, 25 insertions(+), 20 deletions(-)

diff --git a/cpp/src/DO/Sara/MultiViewGeometry/MinimalSolvers/P3PSolver.hpp b/cpp/src/DO/Sara/MultiViewGeometry/MinimalSolvers/P3PSolver.hpp
index 77541f522..5c3fadf76 100644
--- a/cpp/src/DO/Sara/MultiViewGeometry/MinimalSolvers/P3PSolver.hpp
+++ b/cpp/src/DO/Sara/MultiViewGeometry/MinimalSolvers/P3PSolver.hpp
@@ -40,22 +40,26 @@ namespace DO::Sara {
   template <typename CameraModel>
   struct CheiralPnPConsistency
   {
-    using Model = Eigen::Matrix<double, 3, 4>;
+    using PoseMatrix = Eigen::Matrix<double, 3, 4>;
+    using Model = PoseMatrix;
 
+    //! @brief The camera model for the image.
     const CameraModel* camera = nullptr;
-    Model pose;
-    double image_reproj_err_max;
+    //! @brief The pose matrix.
+    PoseMatrix T;
+    //! @brief Image reprojection error in pixel.
+    double ε;
 
-    CheiralPnPConsistency() = default;
+    inline CheiralPnPConsistency() = default;
 
-    CheiralPnPConsistency(const Model& pose)
+    inline CheiralPnPConsistency(const PoseMatrix& pose_matrix)
     {
-      set_model(pose);
+      set_model(pose_matrix);
     }
 
-    auto set_model(const Model& p) -> void
+    inline auto set_model(const PoseMatrix& pose_matrix) -> void
     {
-      pose = p;
+      T = pose_matrix;
     }
 
     template <typename Derived>
@@ -67,33 +71,34 @@ namespace DO::Sara {
         throw std::runtime_error{
             "Error: you must initialize the intrinsic camera parameters!"};
 
-      Eigen::MatrixXd u1n = pose * scene_points;
+      const auto& X_world = scene_points;
+      const Eigen::MatrixXd X_camera = T * X_world;
+
       auto u1 = Eigen::MatrixXd{2, scene_points.cols()};
       for (auto i = 0; i < u1.cols(); ++i)
-        u1.col(i) = camera->project(u1.col(i));
+        u1.col(i) = camera->project(X_camera.col(i));
 
-      Eigen::MatrixXd u2{rays.rows(), rays.cols()};
+      auto u2 = Eigen::MatrixXd{rays.rows(), rays.cols()};
       for (auto i = 0; i < u2.cols(); ++i)
         u2.col(i) = camera->project(rays.col(i));
 
       // Check the cheirality w.r.t. the candidate pose.
-      const auto cheiral = u1n.row(2).array() > 0;
+      const auto cheiral = X_camera.row(2).array() > 0;
 
-      // Checka the image reprojection errors.
-      const auto err_max = square(image_reproj_err_max);
-      const auto small_reproj_error =
-          (u2 - u1).colwise().squaredNorm().array() < err_max;
+      // Check the **squared** image reprojection errors.
+      const auto ε_max = square(ε);
+      const auto ε_small = (u2 - u1).colwise().squaredNorm().array() < ε_max;
 
-      return small_reproj_error && cheiral;
+      return ε_small && cheiral;
     }
 
     //! @brief Check the inlier predicate on a list of correspondences.
     template <typename T>
     inline auto operator()(const PointRayCorrespondenceSubsetList<T>& m) const
-        -> Array<bool, 1, Dynamic>
+        -> Eigen::Array<bool, 1, Eigen::Dynamic>
     {
-      const auto& scene_points = m._p1.colmajor_view().matrix();
-      const auto& backprojected_rays = m._p2.colmajor_view().matrix();
+      const auto& scene_points = m.x.colmajor_view().matrix();
+      const auto& backprojected_rays = m.y.colmajor_view().matrix();
       return this->operator()(scene_points, backprojected_rays);
     }
   };

From 51c865a45faabd0e95308073ed504b82e536c1dc Mon Sep 17 00:00:00 2001
From: David OK <david.ok8@gmail.com>
Date: Wed, 24 Apr 2024 17:18:10 +0100
Subject: [PATCH 49/49] ENH: add robust estimation API for the PnP problem.

---
 cpp/src/DO/Sara/Core/PhysicalQuantities.hpp   |  2 -
 .../MultiViewGeometry/Geometry/Normalizer.hpp | 30 ++++++++++-
 .../Geometry/PinholeCamera.hpp                |  2 -
 .../MinimalSolvers/P3PSolver.hpp              | 50 +++++++++++++++----
 cpp/src/DO/Sara/RANSAC/RANSAC.hpp             |  4 +-
 cpp/src/DO/Sara/RANSAC/RANSACv2.hpp           |  8 +--
 .../SyntheticDataUtilities.hpp                |  7 ++-
 cpp/test/Sara/RANSAC/CMakeLists.txt           |  2 +-
 8 files changed, 79 insertions(+), 26 deletions(-)

diff --git a/cpp/src/DO/Sara/Core/PhysicalQuantities.hpp b/cpp/src/DO/Sara/Core/PhysicalQuantities.hpp
index 39f2d4d43..bb2be3837 100644
--- a/cpp/src/DO/Sara/Core/PhysicalQuantities.hpp
+++ b/cpp/src/DO/Sara/Core/PhysicalQuantities.hpp
@@ -252,8 +252,6 @@ namespace DO::Sara {
     {
     }
 
-    auto operator=(const self_type&) -> self_type& = default;
-
     inline constexpr operator scalar_type() const
     {
       return value;
diff --git a/cpp/src/DO/Sara/MultiViewGeometry/Geometry/Normalizer.hpp b/cpp/src/DO/Sara/MultiViewGeometry/Geometry/Normalizer.hpp
index 3e44cafff..92043f845 100644
--- a/cpp/src/DO/Sara/MultiViewGeometry/Geometry/Normalizer.hpp
+++ b/cpp/src/DO/Sara/MultiViewGeometry/Geometry/Normalizer.hpp
@@ -25,6 +25,8 @@ namespace DO::Sara {
   //! @ingroup GeometryDataNormalizer
 
   //! @{
+
+  //! @brief Normalizer for the two-view homography estimation.
   template <>
   struct Normalizer<Homography>
   {
@@ -70,7 +72,7 @@ namespace DO::Sara {
     Eigen::Matrix3d T2_inv;
   };
 
-
+  //! @brief Normalizer for the two-view fundamental matrix estimation.
   template <>
   struct Normalizer<FundamentalMatrix>
   {
@@ -150,6 +152,7 @@ namespace DO::Sara {
     Eigen::Matrix3d K2_inv;
   };
 
+  //! @brief Normalizer for the two-view relative pose estimation.
   template <>
   struct Normalizer<TwoViewGeometry>
   {
@@ -189,6 +192,31 @@ namespace DO::Sara {
     Eigen::Matrix3d K2_inv;
   };
 
+  //! @brief Normalizer for the PnP estimation.
+  template <>
+  struct Normalizer<Eigen::Matrix<double, 3, 4>>
+  {
+    using PoseMatrix = Eigen::Matrix<double, 3, 4>;
+
+    Normalizer() = default;
+
+    auto normalize(const TensorView_<double, 2>& scene_points,
+                   const TensorView_<double, 2>& backprojected_rays) const
+    {
+      return std::make_tuple(scene_points, backprojected_rays);
+    }
+
+    auto normalize(const PointCorrespondenceList<double>& M) const
+        -> PointCorrespondenceList<double>
+    {
+      return M;
+    }
+
+    auto denormalize(const PoseMatrix&) const -> void
+    {
+    }
+  };
+
   //! @}
 
 } /* namespace DO::Sara */
diff --git a/cpp/src/DO/Sara/MultiViewGeometry/Geometry/PinholeCamera.hpp b/cpp/src/DO/Sara/MultiViewGeometry/Geometry/PinholeCamera.hpp
index 848013e01..7ecdbfe16 100644
--- a/cpp/src/DO/Sara/MultiViewGeometry/Geometry/PinholeCamera.hpp
+++ b/cpp/src/DO/Sara/MultiViewGeometry/Geometry/PinholeCamera.hpp
@@ -11,8 +11,6 @@
 
 #pragma once
 
-#include <DO/Sara/Defines.hpp>
-
 #include <DO/Sara/Core/EigenExtension.hpp>
 #include <DO/Sara/MultiViewGeometry/Geometry/EssentialMatrix.hpp>
 
diff --git a/cpp/src/DO/Sara/MultiViewGeometry/MinimalSolvers/P3PSolver.hpp b/cpp/src/DO/Sara/MultiViewGeometry/MinimalSolvers/P3PSolver.hpp
index 5c3fadf76..07d4c4a2b 100644
--- a/cpp/src/DO/Sara/MultiViewGeometry/MinimalSolvers/P3PSolver.hpp
+++ b/cpp/src/DO/Sara/MultiViewGeometry/MinimalSolvers/P3PSolver.hpp
@@ -2,7 +2,7 @@
 // This file is part of Sara, a basic set of libraries in C++ for computer
 // vision.
 //
-// Copyright (C) 2024 David Ok <david.ok8@gmail.com>
+// Copyright (C) 2024-present David Ok <david.ok8@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla Public
 // License v. 2.0. If a copy of the MPL was not distributed with this file,
@@ -11,7 +11,7 @@
 
 #pragma once
 
-#include "DO/Sara/Core/Math/UsualFunctions.hpp"
+#include <DO/Sara/Core/Math/UsualFunctions.hpp>
 #include <DO/Sara/MultiViewGeometry/PnP/LambdaTwist.hpp>
 #include <DO/Sara/MultiViewGeometry/PointRayCorrespondenceList.hpp>
 
@@ -24,15 +24,28 @@ namespace DO::Sara {
     static constexpr auto num_points = 3;
     static constexpr auto num_models = 4;
 
-    using data_point_type = TensorView_<T, 2>;
+    using tensor_view_type = TensorView_<T, 2>;
+    using data_point_type = std::array<TensorView_<T, 2>, 2>;
     using model_type = Eigen::Matrix<T, 3, 4>;
 
-    inline auto operator()(const data_point_type& x) const
+    inline auto operator()(const tensor_view_type& scene_points,
+                           const tensor_view_type& rays) const
         -> std::vector<model_type>
     {
-      const Eigen::Matrix3<T> scene_points = x.matrix().leftCols(3);
-      const Eigen::Matrix3<T> backprojected_rays = x.matrix().rightCols(3);
-      return solve_p3p(scene_points, backprojected_rays);
+      const auto sp_mat_ = scene_points.colmajor_view().matrix();
+
+      Eigen::Matrix3<T> sp_mat = sp_mat_.topRows(3);
+      if (sp_mat_.cols() == 4)
+        sp_mat.array().rowwise() /= sp_mat_.array().row(3);
+
+      const Eigen::Matrix3<T> ray_mat = rays.colmajor_view().matrix();
+      return solve_p3p(sp_mat, ray_mat);
+    }
+
+    inline auto operator()(const data_point_type& X) -> std::vector<model_type>
+    {
+      const auto& [scene_points, backprojected_rays] = X;
+      return this->operator()(scene_points, backprojected_rays);
     }
   };
 
@@ -47,7 +60,7 @@ namespace DO::Sara {
     const CameraModel* camera = nullptr;
     //! @brief The pose matrix.
     PoseMatrix T;
-    //! @brief Image reprojection error in pixel.
+    //! @brief Image reprojection error in pixels.
     double ε;
 
     inline CheiralPnPConsistency() = default;
@@ -71,14 +84,26 @@ namespace DO::Sara {
         throw std::runtime_error{
             "Error: you must initialize the intrinsic camera parameters!"};
 
+      if (scene_points.cols() != rays.cols())
+        throw std::runtime_error{
+            "Error: the number of scene points and rays must be equal!"};
+
       const auto& X_world = scene_points;
-      const Eigen::MatrixXd X_camera = T * X_world;
+      auto X_camera = Eigen::MatrixXd{};
+      if (X_world.rows() == 3)
+        X_camera = T * X_world.colwise().homogeneous();
+      else if (X_world.rows() == 4)
+        X_camera = T * X_world;
+      else
+        throw std::runtime_error{
+            "The dimension of scene points is incorrect. They must either 3D "
+            "(Euclidean) or 4D (homogeneous)!"};
 
       auto u1 = Eigen::MatrixXd{2, scene_points.cols()};
       for (auto i = 0; i < u1.cols(); ++i)
         u1.col(i) = camera->project(X_camera.col(i));
 
-      auto u2 = Eigen::MatrixXd{rays.rows(), rays.cols()};
+      auto u2 = Eigen::MatrixXd{2, rays.cols()};
       for (auto i = 0; i < u2.cols(); ++i)
         u2.col(i) = camera->project(rays.col(i));
 
@@ -94,7 +119,7 @@ namespace DO::Sara {
 
     //! @brief Check the inlier predicate on a list of correspondences.
     template <typename T>
-    inline auto operator()(const PointRayCorrespondenceSubsetList<T>& m) const
+    inline auto operator()(const PointRayCorrespondenceList<T>& m) const
         -> Eigen::Array<bool, 1, Eigen::Dynamic>
     {
       const auto& scene_points = m.x.colmajor_view().matrix();
@@ -103,4 +128,7 @@ namespace DO::Sara {
     }
   };
 
+
+  //! @}
+
 }  // namespace DO::Sara
diff --git a/cpp/src/DO/Sara/RANSAC/RANSAC.hpp b/cpp/src/DO/Sara/RANSAC/RANSAC.hpp
index 124af43a3..fe6250bd8 100644
--- a/cpp/src/DO/Sara/RANSAC/RANSAC.hpp
+++ b/cpp/src/DO/Sara/RANSAC/RANSAC.hpp
@@ -186,9 +186,9 @@ namespace DO::Sara {
                                  double confidence = 0.99) -> std::uint64_t
   {
     // Check the range of values...
-    if (!(0 <= inlier_ratio && inlier_ratio < 1))
+    if (!(0 <= inlier_ratio && inlier_ratio <= 1))
       throw std::runtime_error{
-          "Error: the inlier ratio must be in the open interval [0, 1["};
+          "Error: the inlier ratio must be in the open interval [0, 1]"};
     if (!(0 <= confidence && confidence < 1))
       throw std::runtime_error{
           "Error: the confidence value must be in the open interval [0, 1["};
diff --git a/cpp/src/DO/Sara/RANSAC/RANSACv2.hpp b/cpp/src/DO/Sara/RANSAC/RANSACv2.hpp
index cb4fd1a16..6a7c377ee 100644
--- a/cpp/src/DO/Sara/RANSAC/RANSACv2.hpp
+++ b/cpp/src/DO/Sara/RANSAC/RANSACv2.hpp
@@ -13,6 +13,8 @@
 
 #include <DO/Sara/RANSAC/RANSAC.hpp>
 
+#include <DO/Sara/Core/DebugUtilities.hpp>
+
 
 namespace DO::Sara::v2 {
 
@@ -100,9 +102,9 @@ namespace DO::Sara::v2 {
           inliers_best.flat_array() = inliers;
           subset_best = minimal_index_subsets[n];
 
-          //
-          inlier_ratio_current =
-              num_inliers / static_cast<double>(data_points.size());
+          inlier_ratio_current = std::clamp(
+              num_inliers / static_cast<double>(data_points.size()),  //
+              0., 1.);
           update_num_iterations();
 
           if (verbose)
diff --git a/cpp/test/Sara/MultiViewGeometry/SyntheticDataUtilities.hpp b/cpp/test/Sara/MultiViewGeometry/SyntheticDataUtilities.hpp
index 7798a696c..47103ea36 100644
--- a/cpp/test/Sara/MultiViewGeometry/SyntheticDataUtilities.hpp
+++ b/cpp/test/Sara/MultiViewGeometry/SyntheticDataUtilities.hpp
@@ -34,7 +34,8 @@ inline auto make_cube_vertices()
   return cube;
 }
 
-inline auto make_planar_chessboard_corners(int rows, int cols, double square_size)
+inline auto make_planar_chessboard_corners(int rows, int cols,
+                                           double square_size)
 {
   auto corners = Eigen::MatrixXd{4, rows * cols};
   for (auto y = 0; y < rows; ++y)
@@ -47,10 +48,8 @@ inline auto make_planar_chessboard_corners(int rows, int cols, double square_siz
 inline auto make_relative_motion(double x = 0.1, double y = 0.3, double z = 0.2)
     -> DO::Sara::Motion
 {
-  using namespace DO::Sara;
-
   // Euler composite rotation.
-  const Eigen::Matrix3d R = rotation(z, y, x);
+  const Eigen::Matrix3d R = DO::Sara::rotation(z, y, x);
   // - The axes of the world coordinate system has turned by the following
   //   rotational quantity.
   // - The columns of R are the vector coordinates of the world axes w.r.t.
diff --git a/cpp/test/Sara/RANSAC/CMakeLists.txt b/cpp/test/Sara/RANSAC/CMakeLists.txt
index 8dee67c92..3a76f3b67 100644
--- a/cpp/test/Sara/RANSAC/CMakeLists.txt
+++ b/cpp/test/Sara/RANSAC/CMakeLists.txt
@@ -5,6 +5,6 @@ foreach (file ${test_SOURCE_FILES})
   sara_add_test(
     NAME ${filename}
     SOURCES ${file}
-    DEPENDENCIES DO::Sara::MultiViewGeometry DO::Sara::RANSAC
+    DEPENDENCIES DO::Sara::MultiViewGeometry DO::Sara::RANSAC fmt::fmt
     FOLDER RANSAC)
 endforeach ()