index.html

<!DOCTYPE html>
<html lang="en">
<!-- lang="en" declares the page language so screen readers, translation tools and hyphenation pick the right rules. -->
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>GEM: A Generalizable Ego-Vision Multimodal World Model for Fine-Grained Ego-Motion, Object Dynamics, and Scene Composition Control</title>
  <!-- External fonts, icon sets and stylesheets (load order preserved) -->
  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet">
  <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="./static/css/index.css">
  <link rel="stylesheet" href="./static/css/bulma.min.css">
  <script defer src="./static/js/fontawesome.all.min.js"></script>
  <!-- Page-specific layout rules; declared after the linked stylesheets so they win on equal specificity. -->
  <style>
    /* Gallery grid shared by every video / GIF section on the page. */
    .video-container {
      display: grid;
      grid-template-columns: repeat(2, 1fr); /* Two equal-width columns */
      gap: 10px; /* Space between items */
      margin: 20px 0; /* Vertical spacing between sections */
      align-items: stretch; /* Items fill the grid cell vertically */
      justify-items: stretch; /* Items fill the grid cell horizontally */
    }

    /* Grid cell that stacks an optional caption above its media. */
    .video-item {
      display: flex;
      flex-direction: column;
      align-items: center; /* Center captions and videos */
    }

    /* Caption shown above a video inside .video-item. */
    .video-description {
      margin-bottom: 10px;
      font-weight: bold; /* Make the caption stand out */
    }

    /* Introductory paragraph placed under each section heading. */
    .section-description {
      margin-bottom: 30px;
      font-size: 1.2em;
      color: #555; /* Subtle color to differentiate from headings */
    }

    /* Sub-section wrapper (subtitle + gallery). */
    .content {
      margin-bottom: 20px; /* Space between sub-sections */
    }

    /* Default presentation for all videos and GIFs on the page. */
    video, img {
      width: 100%; /* Fill the grid cell horizontally */
      height: 100%; /* Fill the grid cell vertically */
      border: 1px solid #ccc;
      border-radius: 5px;
      object-fit: cover; /* Crop rather than distort when the box ratio differs */
    }

    /* Single-column stack used by the pseudo-labelling section. */
    .pseudo-video-container {
      display: flex;
      flex-direction: column; /* Arrange videos vertically */
      justify-content: center;
      align-items: center; /* Center the videos horizontally */
      gap: 20px; /* Spacing between videos */
    }

    .pseudo-video-container video {
      width: 100%; /* Each video spans the full container width */
      /* A CSS length needs a unit: the previous unitless "800" was an invalid
         declaration that standards-mode browsers discard. "auto" states the
         resulting behavior explicitly and keeps the video's aspect ratio. */
      height: auto;
      max-height: 100%; /* Prevent overflow */
      border: 1px solid #ccc;
      border-radius: 5px;
      object-fit: cover;
    }
  </style>
</head>
<body>
  <!-- <div style="background-color: #e6ffe6; padding: 10px; text-align: center; color: green; font-weight: bold;">
    We recommend using Chrome for the best experience. Videos might not work on Safari.
  </div> -->
<section class="hero title-section" style="background-color: #f0f0f0;">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <!-- Paper title -->
          <h1 class="title is-1">
            💎 GEM: A Generalizable Ego-Vision Multimodal World Model for Fine-Grained Ego-Motion, Object Dynamics, and Scene Composition Control
          </h1>

          <!-- Authors: the superscript symbol refers to the contribution legend, the number to the affiliation list below -->
          <div class="author-list" style="margin-top: 20px; font-size: 1.1rem;">
            Mariam Hassan<sup>&#9733;1</sup>, Sebastian Stapf<sup>&#9733;2</sup>, Ahmad Rahimi<sup>&#9733;1</sup>,
            Pedro M B Rezende<sup>&#9733;2</sup>, Yasaman Haghighi<sup>&#9830;1</sup>, <br>
            David Brüggemann<sup>&#9830;3</sup>, Isinsu Katircioglu<sup>&#9830;3</sup>, Lin Zhang<sup>&#9830;3</sup>,
            Xiaoran Chen<sup>&#9830;3</sup>, Suman Saha<sup>&#9830;3</sup>, <br>
            Marco Cannici<sup>&#9830;4</sup>, Elie Aljalbout<sup>&#9830;4</sup>, Botao Ye<sup>&#9830;5</sup>,
            Xi Wang<sup>&#9830;5</sup>, Aram Davtyan<sup>2</sup>, <br>
            Mathieu Salzmann<sup>1,3</sup>, Davide Scaramuzza<sup>4</sup>, Marc Pollefeys<sup>5</sup>,
            Paolo Favaro<sup>2</sup>, Alexandre Alahi<sup>1</sup>
          </div>

          <!-- Institutions, numbered to match the author superscripts -->
          <div class="affiliations" style="margin-top: 20px; font-size: 1rem;">
            <sup>1</sup>École Polytechnique Fédérale de Lausanne (EPFL),
            <sup>2</sup>University of Bern, <br>
            <sup>3</sup>Swiss Data Science Center,
            <sup>4</sup>University of Zurich,
            <sup>5</sup>ETH Zurich
          </div>

          <!-- Meaning of the star / diamond markers used in the author list -->
          <div class="contribution-legend" style="margin-top: 20px; font-size: 1rem; font-style: italic;">
            <sup>&#9733;</sup> Main Contributors &nbsp;&nbsp; <sup>&#9830;</sup> Data Contributors
          </div>

          <!-- Button linking to the paper on arXiv -->
          <span class="link-block" style="margin-left: 10px;">
            <a class="external-link button is-normal is-rounded is-dark" href="https://arxiv.org/abs/2412.11198">
              <span class="icon">
                <i class="ai ai-arxiv"></i>
              </span>
              <span>arXiv</span>
            </a>
          </span>

          <!-- Button linking to the GitHub repository -->
          <span class="link-block" style="margin-left: 10px;">
            <a class="external-link button is-normal is-rounded is-dark" href="https://github.com/vita-epfl/GEM">
              <span class="icon">
                <i class="fab fa-github"></i>
              </span>
              <span>Code</span>
            </a>
          </span>
        </div>
      </div>
    </div>
  </div>
</section>

<!-- Other Sections -->


<section class="section">
  <div class="container has-text-centered">
    <h2 class="title is-2">Unconditional Generations</h2>
    <p class="section-description">
      We show examples of unconditional generations from the model in diverse scenes with different driving dynamics.
    </p>
    <!-- Gallery of muted, looping, autoplaying clips laid out by the .video-container grid -->
    <div class="video-container">
      <!-- Clips whose files are prefixed OpenDV_ -->
      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/generation/OpenDV_002133.mp4" type="video/mp4">
      </video>
      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/generation/OpenDV_000265.mp4" type="video/mp4">
      </video>
      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/generation/OpenDV_000322.mp4" type="video/mp4">
      </video>
      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/generation/OpenDV_000637.mp4" type="video/mp4">
      </video>
      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/generation/OpenDV_001165.mp4" type="video/mp4">
      </video>
      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/generation/OpenDV_001591.mp4" type="video/mp4">
      </video>
      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/generation/OpenDV_002209.mp4" type="video/mp4">
      </video>
      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/generation/OpenDV_002847.mp4" type="video/mp4">
      </video>
      <!-- Clips whose files are prefixed NUSCENES_ -->
      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/generation/NUSCENES_000071.mp4" type="video/mp4">
      </video>
      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/generation/NUSCENES_000964.mp4" type="video/mp4">
      </video>
    </div>
  </div>
</section>


<section class="section">
  <div class="container has-text-centered">
    <h2 class="title is-2">Ego Control</h2>
    <p class="section-description">
      We show examples of ego-motion controllability. All videos are generated by GEM using the same starting frame but with different trajectory control input. <br>
      We observe that the model follows the control signals and generates realistic scenes.
    </p>
    <!-- Four clips in the shared two-column grid -->
    <div class="video-container">
      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/ego/OpenDV_001365-1.mp4" type="video/mp4">
      </video>
      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/ego/OpenDV_001365-2.mp4" type="video/mp4">
      </video>
      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/ego/OpenDV_001161-left.mp4" type="video/mp4">
      </video>
      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/ego/OpenDV_001162.mp4" type="video/mp4">
      </video>
    </div>
  </div>
</section>

<section class="section">
  <div class="container has-text-centered">
    <h2 class="title is-2">Object Manipulation</h2>
    <p class="section-description">
      GEM can move objects in the scene using DINO features. <br>
      In the following examples, we show an unconditional generation by GEM and the same generation with motion control. <br>
      The green box indicates the source DINO features and the blue ones indicate the target position tokens used. <br>
      We observe that the object moves from the green box to the blue box.
    </p>
    <!-- Each grid row pairs an unconditional clip (left) with its motion-controlled counterpart (right) -->
    <div class="video-container">
      <!-- Row 1: the only row carrying captions, which act as column headers -->
      <div class="video-item">
        <p class="video-description">Unconditional Generation</p>
        <video autoplay muted loop playsinline preload="metadata">
          <source src="./static/videos/move/demo6_uc.mp4" type="video/mp4">
        </video>
      </div>
      <div class="video-item">
        <p class="video-description">Object Motion Control</p>
        <video autoplay muted loop playsinline preload="metadata">
          <source src="./static/videos/move/demo6_1_c.mp4" type="video/mp4">
        </video>
      </div>

      <!-- Row 2 -->
      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/move/bike_uc.mp4" type="video/mp4">
      </video>
      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/move/bike_c.mp4" type="video/mp4">
      </video>

      <!-- Row 3 -->
      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/move/demo8_uc.mp4" type="video/mp4">
      </video>
      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/move/demo8_c.mp4" type="video/mp4">
      </video>
    </div>
  </div>
</section>

<section class="section">
  <div class="container has-text-centered">
    <h2 class="title is-2">Object Insertion</h2>

    <!-- Captioned pair: unconditional clip next to the clip generated with insertion control -->
    <div class="video-container">
      <div class="video-item">
        <p class="video-description">Unconditional Generation</p>
        <video autoplay muted loop playsinline preload="metadata">
          <source src="./static/videos/insert/insert1_uc.mp4" type="video/mp4">
        </video>
      </div>
      <div class="video-item">
        <p class="video-description">Insertion Control</p>
        <video autoplay muted loop playsinline preload="metadata">
          <source src="./static/videos/insert/insert1.mp4" type="video/mp4">
        </video>
      </div>
    </div>

    <!-- Explanatory note introducing the second pair -->
    <p style="margin: 10px 0; font-size: 1.0rem;">
      In the following example, we insert a car on the left and control the motion of another car on the right.
    </p>

    <!-- Uncaptioned pair described by the note above -->
    <div class="video-container">
      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/insert/demo1_uc.mp4" type="video/mp4">
      </video>
      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/insert/demo1_c.mp4" type="video/mp4">
      </video>
    </div>
  </div>
</section>


<section class="section">
  <div class="container has-text-centered">
    <h2 class="title is-2">Human Pose Manipulation</h2>
    <p class="section-description">
      GEM can use human poses to control the motion of pedestrians in the scene. <br>
      In these examples, the pedestrians are crossing the street or stopping based on the human pose controls.
    </p>
    <!-- Each grid row shows two variants of one scene: the "_walk" file on the left and the file without that suffix on the right -->
    <div class="video-container">
      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/human/OpenDV_000000_walk.mp4" type="video/mp4">
      </video>
      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/human/OpenDV_000000.mp4" type="video/mp4">
      </video>
      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/human/OpenDV_000001_walk.mp4" type="video/mp4">
      </video>
      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/human/OpenDV_000001.mp4" type="video/mp4">
      </video>

      <!-- Placeholder slot for a further example (kept disabled):
      <video autoplay muted loop playsinline preload="metadata">
        <source src="path/to/human/video3.mp4" type="video/mp4">
      </video> -->
    </div>
  </div>
</section>

<section class="section">
  <div class="container has-text-centered">
    <h2 class="title is-2">Long Generation</h2>
    <p class="section-description">
      We compare our long generation with the only world model trained on OpenDV capable of generating long sequences.<br>
       We observe that our generations have higher ego motion temporal consistency and more realistic dynamics.
    </p>

    <!-- Each grid row is one comparison: left column GEM, right column Vista.
         Only the first row is captioned; the captions of later rows are kept disabled. -->
    <div class="video-container">
      <!-- Comparison 1 (captioned) -->
      <div class="video-item">
        <p class="video-description">GEM</p>
        <video autoplay muted loop playsinline preload="metadata">
          <source src="./static/videos/long/OpenDV-250_000291.mp4" type="video/mp4">
        </video>
      </div>
      <div class="video-item">
        <p class="video-description">Vista</p>
        <video autoplay muted loop playsinline preload="metadata">
          <source src="./static/videos/long/OpenDV_000291_LONG_vista.mp4" type="video/mp4">
        </video>
      </div>

      <!-- Comparison 2 -->
      <div class="video-item">
        <!-- <p class="video-description">GEM</p> -->
        <video autoplay muted loop playsinline preload="metadata">
          <source src="./static/videos/long/OpenDV_000276_LONG.mp4" type="video/mp4">
        </video>
      </div>
      <div class="video-item">
        <!-- <p class="video-description">Vista</p> -->
        <video autoplay muted loop playsinline preload="metadata">
          <source src="./static/videos/long/OpenDV_000276_LONG_vista.mp4" type="video/mp4">
        </video>
      </div>

      <!-- Comparison 3 -->
      <div class="video-item">
        <!-- <p class="video-description">GEM</p> -->
        <video autoplay muted loop playsinline preload="metadata">
          <source src="./static/videos/long/OpenDV_000047_LONG.mp4" type="video/mp4">
        </video>
      </div>
      <div class="video-item">
        <!-- <p class="video-description">Vista</p> -->
        <video autoplay muted loop playsinline preload="metadata">
          <source src="./static/videos/long/OpenDV_000047_LONG_vista.mp4" type="video/mp4">
        </video>
      </div>

      <!-- Comparison 4 -->
      <div class="video-item">
        <!-- <p class="video-description">GEM</p> -->
        <video autoplay muted loop playsinline preload="metadata">
          <source src="./static/videos/long/OpenDV_000395_LONG.mp4" type="video/mp4">
        </video>
      </div>
      <div class="video-item">
        <!-- <p class="video-description">Vista</p> -->
        <video autoplay muted loop playsinline preload="metadata">
          <source src="./static/videos/long/OpenDV_000395_LONG_vista.mp4" type="video/mp4">
        </video>
      </div>
    </div>
    <!-- The container div below was previously left unclosed and only recovered by the parser at </section>. -->
  </div>
</section>


<section class="section">
  <div class="container has-text-centered">
    <h2 class="title is-2">Interesting Observations</h2>
    <!-- Section description -->
    <p class="section-description">
      We show interesting behaviors observed in the generated videos. <br> These behaviors do not necessarily exist in the ground truth videos, but emerge from the model's learned dynamics.
    </p>
    <!-- Two captioned clips side by side in the shared grid -->
    <div class="video-container">
      <!-- Video item 1 -->
      <div class="video-item">
        <p class="video-description">Brake lights go off before moving</p>
        <video autoplay muted loop playsinline preload="metadata">
          <source src="./static/videos/interesting/OpenDV-250_000291.mp4" type="video/mp4">
        </video>
      </div>
      <!-- Video item 2 -->
      <div class="video-item">
        <p class="video-description">Smooth takeover dynamics on a long generation</p>
        <video autoplay muted loop playsinline preload="metadata">
          <source src="./static/videos/interesting/OpenDV-250_000287.mp4" type="video/mp4">
        </video>
      </div>
    </div>
  </div>
</section>

<!-- MultiModal section -->
<section class="section">
  <div class="container has-text-centered">
    <h2 class="title is-2">MultiModal</h2>
    <p class="section-description">
      GEM generates two modalities simultaneously: RGB and Depth. We show examples of multimodal generations.
    </p>
    <!-- Each grid row pairs a clip (left) with its "_depth" counterpart (right) -->
    <div class="video-container">
      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/multimodal/OpenDV_000226.mp4" type="video/mp4">
      </video>
      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/multimodal/OpenDV_000226_depth.mp4" type="video/mp4">
      </video>

      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/multimodal/OpenDV_000195.mp4" type="video/mp4">
      </video>
      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/multimodal/OpenDV_000195_depth.mp4" type="video/mp4">
      </video>

      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/multimodal/OpenDV_001132.mp4" type="video/mp4">
      </video>
      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/multimodal/OpenDV_001132_depth.mp4" type="video/mp4">
      </video>

      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/multimodal/OpenDV_000221.mp4" type="video/mp4">
      </video>
      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/multimodal/OpenDV_000221_depth.mp4" type="video/mp4">
      </video>
    </div>
  </div>
</section>
<!-- MultiDomain Section -->
<section class="section">
  <div class="container has-text-centered">
    <h2 class="title is-2">MultiDomain</h2>
    <p class="section-description">
      GEM is finetuned on two other ego centric domains and we observe it quickly adapts to these new domains.
    </p>

    <!-- The GIFs sit at the bottom of a long page, so they are lazy-loaded and decoded
         asynchronously. Each image gets its own alt text so assistive technology
         does not announce four identical labels in a row. -->

    <!-- Drone Flights -->
    <div class="content">
      <h3 class="subtitle is-3">1-Drone Flights</h3>
      <div class="video-container">
        <img src="./static/videos/multidomain/1.gif" alt="Generated drone flight clip, example 1" loading="lazy" decoding="async">
        <img src="./static/videos/multidomain/2.gif" alt="Generated drone flight clip, example 2" loading="lazy" decoding="async">
        <img src="./static/videos/multidomain/3.gif" alt="Generated drone flight clip, example 3" loading="lazy" decoding="async">
        <img src="./static/videos/multidomain/4.gif" alt="Generated drone flight clip, example 4" loading="lazy" decoding="async">
      </div>
    </div>

    <!-- Human EgoCentric -->
    <div class="content">
      <h3 class="subtitle is-3">2-Human EgoCentric</h3>
      <div class="video-container">
        <img src="./static/videos/multidomain/5.gif" alt="Generated human egocentric clip, example 1" loading="lazy" decoding="async">
        <img src="./static/videos/multidomain/6.gif" alt="Generated human egocentric clip, example 2" loading="lazy" decoding="async">
        <img src="./static/videos/multidomain/7.gif" alt="Generated human egocentric clip, example 3" loading="lazy" decoding="async">
        <img src="./static/videos/multidomain/8.gif" alt="Generated human egocentric clip, example 4" loading="lazy" decoding="async">
      </div>
    </div>
  </div>
</section>


<!-- Pseudo-labelling section -->
<section class="section">
  <div class="container has-text-centered">
    <h2 class="title is-2">Pseudo-Labelling</h2>
    <p class="section-description">
      Some visualisations of the outputs of our pseudo-labeling pipeline.
    </p>
    <!-- Unlike the two-column galleries, these clips are stacked vertically by .pseudo-video-container -->
    <div class="pseudo-video-container">
      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/pseudo/pseudo_label.mp4" type="video/mp4">
      </video>

      <video autoplay muted loop playsinline preload="metadata">
        <source src="./static/videos/pseudo/pseudo_label_1.mp4" type="video/mp4">
      </video>
    </div>
  </div>
</section>


</body>
</html>