<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="description"
content="AugInsert: Learning Robust Visual-Force Policies via Data Augmentation for Object Assembly Tasks">
<meta name="keywords" content="Contact-Rich Manipulation, Imitation Learning, Data Augmentation">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>AugInsert</title>
<!-- <script async src="https://www.googletagmanager.com/gtag/js?id=G-FV4ZJ9PVSV"></script>
<script>
window.dataLayer = window.dataLayer || [];
function gtag(){dataLayer.push(arguments);}
gtag('js', new Date());
gtag('config', 'G-FV4ZJ9PVSV');
</script> -->
<script>
// Swap the real-world results video to match the task variation selected in the dropdown.
function updateRealWorldVideo() {
  var variation = document.getElementById("task-variation").value;
  var video = document.getElementById("real-world-canonical-video");
  video.src = "media/results/real_world/real_world_" + variation + ".mp4";
  video.playbackRate = 1.0;
  video.play();
}
// Swap the attention visualization video to match the selected task variation.
function updateAttnVisVideo() {
  var attnVisVariation = document.getElementById("task-variation-attnvis").value;
  var video = document.getElementById("attnvis-video");
  video.src = "media/results/attn_vis/" + attnVisVariation + "_attnvis.mp4";
  video.playbackRate = 1.0;
  video.play();
}
</script>
<link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
rel="stylesheet">
<link rel="stylesheet" href="./static/css/bulma.min.css">
<link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
<link rel="stylesheet" href="./static/css/bulma-slider.min.css">
<link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
<link rel="stylesheet"
href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
<link rel="stylesheet" href="./static/css/index.css">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script defer src="./static/js/fontawesome.all.min.js"></script>
<script src="./static/js/bulma-carousel.min.js"></script>
<script src="./static/js/bulma-slider.min.js"></script>
<script src="./static/js/index.js"></script>
</head>
<body onload="updateRealWorldVideo(); updateAttnVisVideo();">
<nav class="navbar" role="navigation" aria-label="main navigation">
<div class="navbar-brand">
<a role="button" class="navbar-burger" aria-label="menu" aria-expanded="false">
<span aria-hidden="true"></span>
<span aria-hidden="true"></span>
<span aria-hidden="true"></span>
</a>
</div>
<div class="navbar-menu">
<div class="navbar-start" style="flex-grow: 1; justify-content: center;">
<a class="navbar-item" target="_blank" href="https://ryangdiaz.github.io">
<span class="icon">
<i class="fas fa-home"></i>
</span>
</a>
<div class="navbar-item has-dropdown is-hoverable">
<a class="navbar-link">
More Research
</a>
<div class="navbar-dropdown">
<a class="navbar-item" target="_blank" href="https://sites.google.com/view/geometric-peg-in-hole">
Geometric Peg-in-Hole
</a>
</div>
</div>
</div>
</div>
</nav>
<section class="hero">
<div class="hero-body">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column has-text-centered">
<h1 class="title is-1 publication-title">AugInsert: Learning Robust Visual-Force Policies via Data Augmentation for Object Assembly Tasks </h1>
<!-- <h3 class="title is-4 conference-authors">In Review</h3> -->
<div class="is-size-5 publication-authors">
<span class="author-block">
<a target="_blank" href="https://ryangdiaz.github.io">Ryan Diaz</a><sup>1</sup>,</span>
<span class="author-block">
Adam Imdieke<sup>1</sup>,</span>
<!-- <a target="_blank">Adam Imdieke</a><sup>1</sup>,</span> -->
<span class="author-block">
<a target="_blank" href="https://uk.linkedin.com/in/vivkjv?challengeId=AQG2hYTlKNvSfgAAAZJn0G9kIMKEUJrV9seua7v50qrxagyTbcx-IKbgDMOOmSBOBCBRQJuNhDqwv0Ew9Gmf9_lax0aK2_Pu2Q&submissionId=3d87fe1f-9438-fc17-f9ad-445166baba51&challengeSource=AgGFQNWOIp7OrQAAAZJn0Hc5q4aBOEUdjS8KFr70hgFKqSaeGG5TSJQzTEWgXLY&challegeType=AgHrX4EpDIW4CgAAAZJn0Hc8sJx6fvobvF2x1gbfALFcqJ2QMq1TwLA&memberId=AgEZ9eO9oHVbzQAAAZJn0HdBhXx60YlsQVN3Ohy85LuNAYc&recognizeDevice=AgHHEpZk1f0w7QAAAZJn0HdGe0TWh6yAhQXJonFwXMUPascvv_v5">Vivek Veeriah</a><sup>2</sup>,</span>
<span class="author-block">
<a target="_blank" href="https://karthikdesingh.com">Karthik Desingh</a><sup>1</sup>
</span>
</div>
<div class="is-size-5 publication-authors">
<span class="author-block"><sup>1</sup>University of Minnesota - Twin Cities,</span>
<span class="author-block"><sup>2</sup>Google DeepMind</span>
</div>
<div class="column has-text-centered">
<div class="publication-links">
<!-- PDF Link. -->
<span class="link-block">
<a target="_blank" href="paper/auginsert_icra2025.pdf"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fas fa-file-pdf"></i>
</span>
<span>Paper</span>
</a>
</span>
<!-- Arxiv Link. -->
<span class="link-block">
<a target="_blank" href="https://arxiv.org/abs/2410.14968"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fas fa-file"></i>
</span>
<span>ArXiv</span>
</a>
</span>
<!-- Video Link. -->
<span class="link-block">
<a target="_blank" href="https://www.youtube.com/watch?v=UTA7sefgs2o"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-youtube"></i>
</span>
<span>Video</span>
</a>
</span>
<!-- Talk Link. -->
<!-- <span class="link-block">
<a target="_blank" href="https://youtu.be/QcuXwmQgurE"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fas fa-chalkboard-teacher"></i>
</span>
<span>Talk</span>
</a>
</span> -->
<!-- Code Link. -->
<span class="link-block">
<a target="_blank" href="https://github.com/RyangDiaz/auginsert"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-github"></i>
</span>
<span>Code</span>
</a>
</span>
</div>
</div>
<br>
<br>
</div>
</div>
</div>
</div>
</section>
<section class="section">
<div class="container is-max-desktop">
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Abstract</h2>
<div class="content has-text-justified">
<p>
This paper focuses on learning robust visual-force policies for high-precision object assembly
tasks. Specifically, we target the <i>contact phase</i> of the assembly task, where both objects (peg and hole) have made
contact and the objective is to maneuver them to complete the assembly. Moreover, we aim to learn contact-rich
manipulation policies with multisensory inputs from limited expert data by expanding human demonstrations via online data
augmentation.
</p>
<p>
We develop a simulation environment with a dual-arm robot manipulator to evaluate the effect of augmented expert demonstration
data. We evaluate the robustness of our model with respect to several task variations: <strong>grasp pose, peg/hole
shape, object body shape, scene appearance, camera pose</strong>, and <strong>force-torque/proprioception noise</strong>. We show that
our proposed data augmentation method helps in learning a multisensory manipulation policy that is robust to unseen instances of these
variations, particularly physical variations such as <strong>grasp pose</strong>. Additionally, our ablation studies show the
significant contribution of force-torque data to the robustness of our model.
</p>
<p>
For additional qualitative results and ablation studies, including experiments carried out in our real-world environment, refer to the
supplementary material on this webpage!
</p>
</div>
</div>
</div>
<br>
<br>
</div>
<div class="columns is-centered has-text-centered">
<div class="column is-two-thirds">
<h2 class="title is-3">Video</h2>
<div class="publication-video">
<iframe src="https://www.youtube.com/embed/UTA7sefgs2o?si=rb-pFjDM_QlrQyeM"
frameborder="0" allow="autoplay; encrypted-media" allowfullscreen></iframe>
</div>
</div>
</div>
</section>
<section class="section">
<div class="container is-max-widescreen">
<div class="rows">
<div class="rows is-centered ">
<div class="row is-full-width">
<h2 class="title is-3">Model Architecture</h2>
<img src="media/figures/model_architecture.png" class="interpolation-image"
alt="Interpolate start reference image." />
</br>
</br>
<p>
Our behavior cloning framework is built on <a target="_blank" href="https://robomimic.github.io">Robomimic</a>. To encode our observations, we draw on the success of <a target="_blank" href="https://www.mmintlab.com/research/vtt/">visuotactile transformer</a> (VTT) encoders and use a similar attention-based mechanism for fusing the RGB and tactile modalities. Rather than performing self-attention directly on the input tokens, we found that introducing a cross-attention step similar to the <a target="_blank" href="https://deepmind.google/discover/blog/building-architectures-that-can-handle-the-worlds-data/">PerceiverIO</a> architecture worked best for our task. We tokenize our inputs by computing linear projections of visual patches (as in <a target="_blank" href="https://arxiv.org/abs/2010.11929">vision transformers</a>) for the RGB inputs and of individual per-timestep readings for the force-torque input, and then add modality-specific position encodings. These input tokens are cross-attended with a set of 8 learned latent vectors, which then pass through a series of self-attention layers before being compressed and projected (as in VTT) to an output latent embedding. Proprioception is encoded with a multilayer perceptron, and the two output embeddings are concatenated to form the input to the policy network, a multilayer perceptron that outputs 3-dimensional delta actions.
</p>
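<p>
The snippet below is a minimal PyTorch-style sketch of the encoder and policy described above; module names, token counts, and hidden sizes are illustrative assumptions rather than the exact implementation (see the <a target="_blank" href="https://github.com/RyangDiaz/auginsert">code release</a> for details).
</p>
<pre><code># Hypothetical sketch of the visuotactile encoder and policy (PyTorch).
# Layer names, token counts, and hidden sizes are illustrative assumptions.
import torch
import torch.nn as nn

class CrossAttentionFusion(nn.Module):
    def __init__(self, dim=128, num_latents=8, num_heads=4, num_self_layers=2):
        super().__init__()
        # RGB patches and force-torque timesteps are linearly projected into tokens
        self.rgb_proj = nn.Linear(16 * 16 * 3, dim)        # e.g., 16x16 image patches
        self.ft_proj = nn.Linear(6, dim)                   # one 6-D wrench per timestep
        # Modality-specific (learned) position encodings
        self.rgb_pos = nn.Parameter(torch.zeros(72, dim))  # 36 patches per wrist view
        self.ft_pos = nn.Parameter(torch.zeros(32, dim))   # 32-step force-torque history
        # 8 learned latent vectors, cross-attended with the input tokens (PerceiverIO-style)
        self.latents = nn.Parameter(torch.randn(num_latents, dim))
        self.cross_attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        self.self_attn = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(dim, num_heads, batch_first=True),
            num_layers=num_self_layers,
        )
        # Compress the latents into a single output embedding (as in VTT)
        self.compress = nn.Linear(num_latents * dim, dim)

    def forward(self, rgb_patches, ft_history):
        # rgb_patches: (B, 72, 768), ft_history: (B, 32, 6)
        tokens = torch.cat([
            self.rgb_proj(rgb_patches) + self.rgb_pos,
            self.ft_proj(ft_history) + self.ft_pos,
        ], dim=1)
        latents = self.latents.expand(tokens.shape[0], -1, -1)
        latents, _ = self.cross_attn(latents, tokens, tokens)  # queries are the latents
        latents = self.self_attn(latents)
        return self.compress(latents.flatten(1))

class Policy(nn.Module):
    def __init__(self, dim=128, prop_dim=14):  # proprioception dimension is an assumption
        super().__init__()
        self.encoder = CrossAttentionFusion(dim)
        self.prop_mlp = nn.Sequential(nn.Linear(prop_dim, dim), nn.ReLU(), nn.Linear(dim, dim))
        self.policy = nn.Sequential(nn.Linear(2 * dim, dim), nn.ReLU(), nn.Linear(dim, 3))

    def forward(self, rgb_patches, ft_history, proprio):
        # Concatenate the fused visuotactile embedding with the proprioception embedding
        z = torch.cat([self.encoder(rgb_patches, ft_history), self.prop_mlp(proprio)], dim=1)
        return self.policy(z)  # 3-dimensional delta action

policy = Policy()
action = policy(torch.randn(1, 72, 768), torch.randn(1, 32, 6), torch.randn(1, 14))
</code></pre>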
<br/>
<br/>
<h2 class="title is-3">Supplementary Experiments and Results</h2>
<h3 class="title is-4">Additional Simulation Experiments</h3>
<h4 class="title is-5">Attention Visualization</h4>
<p>
To gain further insight into what our model learns, we visualize the attention weights in the latent-vector cross-attention
step of the transformer visuotactile encoder. For each modality, we plot attention weights as proportions of the total attention paid to tokens of that
specific modality, averaged over the 8 learned latent vectors. These weights are visualized as heatmaps overlaid on the left and right wrist-view images
for visual attention, and as bars for each timestep under the force reading for tactile attention. We also plot the proportion of total attention given to
each modality (visual and tactile) over the course of a rollout.
</p>
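<p>
As a rough illustration, the sketch below shows one way these per-modality attention proportions could be computed from the latent cross-attention weights; the token counts follow the description above, and the function name and shapes are assumptions.
</p>
<pre><code># Hypothetical sketch of computing per-modality attention proportions from the
# latent cross-attention weights; shapes and names are illustrative assumptions.
import torch

def attention_proportions(attn_weights, num_visual=72, num_tactile=32):
    # attn_weights: (num_latents, num_visual + num_tactile), one row per learned
    # latent vector, each row summing to 1.
    avg = attn_weights.mean(dim=0)                 # average over the 8 learned latents
    visual, tactile = avg[:num_visual], avg[num_visual:]
    visual_total, tactile_total = visual.sum(), tactile.sum()
    return {
        "visual_heatmap": visual / visual_total,   # proportions within the visual tokens
        "tactile_bars": tactile / tactile_total,   # proportions within the tactile tokens
        "modality_split": (visual_total.item(), tactile_total.item()),
    }

weights = torch.softmax(torch.randn(8, 104), dim=1)  # placeholder cross-attention weights
props = attention_proportions(weights)
</code></pre>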
<br/>
<br/>
<div class="columns"></div>
<div class="column has-text-centered">
<h3 class="title is-5">Attention Visualization: Model Trained on <i>Grasp Pose</i>, <i>Peg/Hole Shape</i>, and <i>Object Body Shape</i></h3>
Model evaluated on
<div class="select is-small">
<select id="task-variation-attnvis" onchange="updateAttnVisVideo()">
<option value="canonical" selected="selected">Canonical (No Variations)</option>
<option value="grasp">Grasp Pose</option>
<option value="peg_hole_shape">Peg/Hole Shape</option>
<option value="body_shape">Object Body Shape</option>
<option value="visual">Scene Appearance</option>
<option value="camera_angle">Camera Pose</option>
<option value="sensor_noise">Sensor Noise</option>
<option value="all_vars">All Variations</option>
</select>
</div>
task variation (videos sped up x2)
<br/>
<br/>
<video id="attnvis-video"
muted
autoplay
loop
width="95%">
<source src="media/results/attn_vis/canonical_attnvis.mp4"
type="video/mp4">
</video>
<p><strong>NOTE:</strong> 5 rollouts per video (50 rollouts for experiment results).</p>
</div>
</div>
<p>
<strong>Takeaways:</strong> Despite our model taking in over twice as many visual tokens (72 tokens, 36 per view) as
tactile ones (32 tokens), we observe that tactile attention accounts for almost the entire proportion of attention across
the input (as seen in the right-most plot of the videos). This finding provides further evidence for the importance of tactile
information over visual information, as discussed in our paper, where we found that removing visual information from our input had
little impact on the robustness of our model. Furthermore, we observe that the visual attention is mostly focused on semantically insignificant parts of the
scene, such as the gripper at the bottom of the view, suggesting that the model is not receiving much useful visual information.
</p>
<br/>
<br/>
<!-- Offline vs. online augmentation -->
<h4 class="title is-5">Comparing Data Augmentation Methods for Model Robustness</h4>
<p>
To assess the validity of our online augmentation method for increasing model robustness, we construct a dataset
of human-generated trajectories with an <i>extended</i> set of visual variations and sensor noise, emulating a baseline data
augmentation method that applies augmentations independently to each sensory modality offline during training. More specifically, we generate
a dataset with training-set variations of <i>Scene Appearance</i> (including object color, floor texture, and lighting), <i>Camera Pose</i>, and
<i>Sensor Noise</i>, with 12 augmentations per demonstration; rather than keeping the applied variations consistent throughout each augmented rollout, we apply new instances of the <i>Scene Appearance</i>
and <i>Camera Pose</i> variations at each step of the demonstration. We also multiply the force-torque history reading by a random constant (from
0.1 to 2.0) drawn independently for each frame, following a data augmentation strategy similar to that of <a href="https://arxiv.org/abs/2104.14223">InsertionNet</a>.
We denote this dataset as <strong>Expanded Visual+Noise</strong>.
</p>
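<p>
A minimal sketch of the per-frame force-torque scaling used for this dataset is shown below; the function and array shapes are illustrative assumptions.
</p>
<pre><code># Hypothetical sketch of the per-frame force-torque scaling described above
# (InsertionNet-style); array shapes are illustrative assumptions.
import numpy as np

def augment_force_torque(ft_history, rng, low=0.1, high=2.0):
    # ft_history: (T, 6) force-torque history observed at one frame of the demonstration.
    # A single random constant is drawn independently for each frame.
    scale = rng.uniform(low, high)
    return ft_history * scale

rng = np.random.default_rng(0)
demo_ft_frames = [np.ones((32, 6)) for _ in range(100)]  # placeholder demonstration frames
augmented = [augment_force_torque(frame_ft, rng) for frame_ft in demo_ft_frames]
</code></pre>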
<br/>
<div class="column has-text-centered">
<video id="expanded_augments_video"
muted
autoplay
loop
width="60%">
<source src="media/video/offline_aug_dataset.mp4" type="video/mp4">
</video>
<p>
Visualization of augmented observations collected for the <strong>Expanded Visual+Noise</strong> Dataset
</p>
</div>
<br/>
<p class="justify">
<img src="media/figures/ablation_training_set_extended.png" class="interpolation-image" width="640" align="right" style="margin:0% 4% " alt="Success rate change for models trained on the Expanded Visual+Noise dataset." />
We report the % success rate change from the canonical-environment success rate for models trained on the <strong>Expanded Visual+Noise</strong> dataset and compare
it with the original training-set models from our paper (namely <strong>Visual+Sensor Noise</strong>, which does not apply new variation instances per frame, and
<strong>Base</strong>, which includes <i>Grasp Pose</i> variations).
</p>
<br/>
<p>
<strong>Takeaways:</strong> We observe that our dataset with an expanded set of augmentations applied independently to each sensory modality does not necessarily
improve robustness on most task variations (save for <i>Peg/Hole Shape</i>) compared to the original <strong>Visual+Sensor Noise</strong> dataset, which used less extensive data augmentation. Most crucially, we
do not see a significant improvement on <i>Grasp Pose</i> variations, validating the effect of non-independent multisensory data augmentation via trajectory replay. Thus, even
extensive independent augmentation of our multisensory input may not be enough to handle certain task variations involved in our contact-rich task.
</p>
<br/>
<br/>
<!-- Canonical success rates for all experiments -->
<h4 class="title is-5">Success Rates in Canonical Environment</h4>
<p>
For full transparency in the experiments that report the % success rate change from the <i>Canonical</i> environment, we explicitly report, for each trained model, the success rate in the no-variations
<i>Canonical</i> environment on which that change is based. Success rates are averaged across 6 training seeds, with error bars representing
one standard deviation from the mean. Note that the average % success rate change across the 6 training seeds is computed by first determining the % success rate change
for each individual seed and then averaging those values, rather than by averaging the success rates across the 6 seeds and taking the difference of those averages.
</p>
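<p>
As a small illustration, the sketch below computes the reported quantity per seed before averaging; the change is assumed here to be a percentage-point difference, and the numbers are placeholders.
</p>
<pre><code># Hypothetical sketch of the per-seed averaging described above.
def avg_success_rate_change(canonical_rates, variation_rates):
    # Per-seed success rates (in %) for 6 seeds; the change is assumed here to be the
    # percentage-point difference, computed per seed and then averaged over seeds.
    per_seed_change = [v - c for c, v in zip(canonical_rates, variation_rates)]
    return sum(per_seed_change) / len(per_seed_change)

canonical = [80.0, 76.0, 84.0, 78.0, 82.0, 80.0]   # placeholder per-seed rates (%)
variation = [60.0, 58.0, 70.0, 55.0, 66.0, 62.0]   # placeholder per-seed rates (%)
print(avg_success_rate_change(canonical, variation))
</code></pre>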
<br/>
<br/>
<p class="justify">
<img src="media/figures/ablation_training_set_wristviews_evals_canonical.png" class="interpolation-image" width="360" align="left" style="margin:2% 0% " alt="Canonical success rates for models with different training set variations." />
<img src="media/figures/ablation_modinput_wristview_canonical_evals_canonical.png" class="interpolation-image" width="360" align="center" style="margin:0% 0% " alt="Canonical success rates for models with different modality inputs, trained on No Variations." />
<img src="media/figures/ablation_modinput_wristview_tclone_evals_canonical.png" class="interpolation-image" width="360" align="right" style="margin:0% 2% " alt="Canonical success rates for models with different modality inputs, trained with task variations." />
</p>
<br/>
<p>
<strong>Left Plot:</strong> Success rates on <i>Canonical</i> environment for models with different training set variations. This graph corresponds to the results reported in Figure 5 in our paper.
</p>
<br/>
<p>
<strong>Center Plot:</strong> Success rates on <i>Canonical</i> environment for models with different modality input combinations, trained on <i>No Variations</i>. This graph corresponds to the results reported in the top graph of Figure 7 in our paper.
</p>
<br/>
<p>
<strong>Right Plot:</strong> Success rates on <i>Canonical</i> environment for models with different modality input combinations, trained on <i>Grasp Pose</i>, <i>Peg/Hole Shape</i>, and <i>Object Body Shape</i>. This graph corresponds to the results reported in the bottom graph of Figure 7 in our paper. We especially note the instability of performance in the <strong>No Vision</strong> model, which provides context for its omission in the corresponding plot in our paper.
</p>
<br/>
<br/>
<h3 class="title is-4">Real World Experiments</h3>
<!-- Real world setup definition -->
<h4 class="title is-5">Real World Experiment Setup</h4>
<p>
Our real-world task setup is built to mirror our simulation setup as closely as possible. We designate one arm to be compliant,
applying a constant amount of force, while the other arm moves according to the actions given by the policy. In contrast to
the policies trained in simulation, our real-world policies predict 2-dimensional delta actions in the axes perpendicular to the
axis of insertion (rather than 3-dimensional actions that include the insertion axis) in order to prevent the potentially unsafe interactions that could result from a premature insertion attempt.
Once the peg and hole are aligned, the compliant arm automatically moves its held object forward to complete the insertion. We train
our real-world models with the same hyperparameters as in simulation, although we train only 1 seed per model
(rather than 6). Additionally, we evaluate each model at the end of training, rather than performing training-set
rollouts during training to determine the best checkpoint. Successes and failures follow the same general criteria as in
simulation, though a human manually annotates each trial.
</p>
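<p>
As a rough illustration of this action-space restriction, the sketch below masks the insertion-axis component of a 3-dimensional delta action; in practice our real-world policies directly predict the 2-dimensional action, and the axis convention shown here is an assumption.
</p>
<pre><code># Hypothetical sketch of restricting motion to the two axes perpendicular to the
# insertion axis; names and the axis convention are illustrative assumptions.
import numpy as np

INSERTION_AXIS = 0  # assumption: insertion along x in the end-effector frame

def mask_delta_action(delta_xyz):
    # Zero out motion along the insertion axis so the policy only aligns the peg
    # and hole; the compliant arm then completes the insertion on its own.
    delta = np.asarray(delta_xyz, dtype=float).copy()
    delta[INSERTION_AXIS] = 0.0
    return delta

print(mask_delta_action([0.01, -0.004, 0.002]))  # insertion-axis motion removed
</code></pre>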
<br/>
<br/>
<h4 class="title is-5">Real World Task Variation Difficulty</h4>
<!-- Canonical experiments graph -->
<p class="justify">
<img src="media/figures/ablation_real_world_canonical.png" class="interpolation-image" width="640" align="right" style="margin:0% 4% " alt="Interpolate start reference image." />
As a real-world analog to our simulation experiment aimed towards determining the difficulty of generalization for each of our task
variations, we evaluate a real-world policy trained on a dataset of human-generated demonstrations with no applied task variations
on real-world versions of a subset of our task variations. Reported success rates over 20 rollouts can be found in the accompanying figure.
</p>
<br/>
<p>
<strong>Takeaways:</strong> As in simulation, we observe that <i>Grasp Pose</i> variations seem to be the most difficult to generalize
to, while the model is able to handle the mostly unisensory perturbations of <i>Object Body Shape</i> and <i>Scene Appearance</i> (Object Color and Lighting). We also
notice that our model struggles with <i>Grasp Pose</i> even when rotational grasp variations are removed; we hypothesize that this may be
because a translational offset disrupts the desired behavior of lining up end-effector positions (given by proprioceptive input) in order
to line up the peg and hole (i.e., solving the task can no longer be done by simply matching the end-effector positions of the two arms).
Including <i>Grasp Pose</i> variations in the training dataset (as was done in simulation through online augmentation) may also improve
performance in the real world.
</p>
<br/>
<br/>
<!-- Canonical experiments visualization -->
<div class="columns">
<div class="column has-text-centered">
<h3 class="title is-5">Real World Rollouts: Model Trained on <i>No Variations</i></h3>
Model evaluated on
<div class="select is-small">
<select id="task-variation" onchange="updateRealWorldVideo()">
<option value="canonical" selected="selected">Canonical (No Variations)</option>
<option value="grasp">Grasp Pose (XT+ZR)</option>
<option value="body_shape">Object Body Shape</option>
<option value="visual">Visual (Lighting/Object Color)</option>
</select>
</div>
task variation (videos sped up x2)
<br/>
<br/>
<video id="real-world-canonical-video"
muted
autoplay
loop
width="80%">
<source src="media/results/real_world/real_world_canonical.mp4"
type="video/mp4">
</video>
<br/>
<br/>
<p><strong>NOTE:</strong> 5 rollouts per video (20 rollouts for experiment results).</p>
</div>
</div>
<br/>
<br/>
<!-- Modality input ablation graph -->
<h4 class="title is-5">Real World Sensory Input Ablation Study</h4>
<p class="justify">
<img src="media/figures/ablation_real_world_modinput.png" class="interpolation-image" width="640" align="right" style="margin:0% 4% " alt="Interpolate start reference image." />
We conduct a reduced real-world analog to the ablation study in our paper that investigated how much each sensory modality contributed
to overall model performance by training models that only accepted a subset of input modalities (vision, touch, and/or proprioception).
We train real-world policies on a dataset of only human demonstrations and evaluate them on a smaller subset of our real-world task variations.
Reported success rates over 20 rollouts can be found in the accompanying figure.
</p>
<br/>
<p>
<strong>Takeaways:</strong> As in simulation, we observe that removing force-torque data from the input (the <strong>No Touch</strong> model) leads to a significant drop in
success rate for all task variations compared to the <strong>Full Model</strong>, including the no-variations <i>Canonical</i> environment. We also see a small drop in performance
for the <strong>No Vision</strong> model, somewhat aligning with our simulation finding that visual input contributes little to our task. Surprisingly, we see performance
increases on all task variations for the <strong>No Prop.</strong> model. We hypothesize that the narrow range of end-effector poses in our training dataset, a consequence of the
high precision required by the task, may prevent our models from learning much useful information from the proprioceptive embedding, though this observation may also be
the result of the small number of trained models. Averaging the performance of models trained over multiple seeds (as was done in simulation), which we could not do here
due to time constraints, may yield more robust results.
</p>
</div>
</div>
</div>
</section>
<section class="section" id="BibTeX">
<div class="container is-max-widescreen content">
<h2 class="title">BibTeX</h2>
<pre><code>@article{diaz2024auginsert,
title = {AugInsert: Learning Robust Visual-Force Policies via Data Augmentation for Object Assembly Tasks},
author = {Diaz, Ryan and Imdieke, Adam and Veeriah, Vivek and Desingh, Karthik},
journal = {arXiv preprint arXiv:2410.14968},
year = {2024},
}</code></pre>
</div>
</section>
<footer class="footer">
<div class="container">
<div class="columns is-centered">
<div class="column">
<div class="content has-text-centered">
<p>
This website template was borrowed from <a href="https://github.com/nerfies/nerfies.github.io">NeRFies</a> made by <a href="https://keunhong.com/">Keunhong Park</a> and <a href="https://peract.github.io">PerAct</a> made by <a href="https://mohitshridhar.com">Mohit Shridhar</a>.
</p>
</div>
</div>
</div>
</div>
</footer>
</body>
</html>