<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="description"
content="DriveDreamer: Towards Real-world-driven World Models for Autonomous Driving">
<meta name="keywords" content="DriveDreamer">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>DriveDreamer: Towards Real-world-driven World Models for Autonomous Driving</title>
<!-- Global site tag (gtag.js) - Google Analytics -->
<script async src="https://www.googletagmanager.com/gtag/js?id=G-PYVRSFMDRL"></script>
<script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js?config=TeX-MML-AM_CHTML"></script>
<script>
window.dataLayer = window.dataLayer || [];
function gtag() {
dataLayer.push(arguments);
}
gtag('js', new Date());
gtag('config', 'G-PYVRSFMDRL');
</script>
<link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
rel="stylesheet">
<link rel="stylesheet" href="./static/css/bulma.min.css">
<link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
<link rel="stylesheet" href="./static/css/bulma-slider.min.css">
<link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
<link rel="stylesheet" href="./static/css/index.css">
<link rel="icon" href="./static/images/ms_icon.png">
<style>
table {
font-family: arial, sans-serif;
border-collapse: collapse;
width: 100%;
}
td, th {
border: 2px solid #F1F4F5;
text-align: left;
padding: 8px;
}
tr:nth-child(3n - 1) {
background-color: #F1F4F5;
}
tr:nth-child(3n) {
border: 2px solid #FFFFFF;
}
</style>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script defer src="./static/js/fontawesome.all.min.js"></script>
<script src="./static/js/bulma-carousel.min.js"></script>
<script src="./static/js/bulma-slider.min.js"></script>
<script src="./static/js/index.js"></script>
</head>
<body>
<section class="hero">
<div class="hero-body">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column has-text-centered">
<h1 class="title is-1 publication-title">DriveDreamer: Towards Real-world-driven <br> World Models for Autonomous Driving </h1>
<div class="is-size-5 publication-authors">
<span class="author-block">
<a href="https://scholar.google.com/citations?user=96lsfiUAAAAJ&hl">Xiaofeng Wang</a><sup>1*</sup>,</span>
<span class="author-block">
<a href="https://scholar.google.com/citations?user=NmwjI0AAAAAJ&hl">Zheng Zhu</a><sup>1*</sup>,</span>
<span class="author-block">
<a>Guan Huang</a><sup>1,2</sup>,</span>
<span class="author-block">
<a>Xinze Chen</a><sup>1</sup>,</span>
<span class="author-block">
<a>Jiagang Zhu</a><sup>1</sup>,</span>
<span class="author-block">
<a href="https://scholar.google.com/citations?user=TN8uDQoAAAAJ&hl">Jiwen Lu</a><sup>2</sup>,</span>
<br>
</div>
<div class="is-size-5 publication-authors">
<span class="author-block">GigaAI<sup>1</sup>,</span>
<span class="author-block">Tsinghua University<sup>2</sup></span>
</div>
<div class="is-size-5 publication-authors">
<span class="author-block"><sup>*</sup>Equal Contribution</span>
</div>
<div class="column has-text-centered">
<div class="publication-links">
<!-- PDF Link. -->
<span class="link-block">
<a href="https://arxiv.org/pdf/2309.09777.pdf"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="ai ai-arxiv"></i>
</span>
<span>arXiv</span>
</a>
</span>
<!-- Code Link. -->
<span class="link-block">
<a href="https://github.com/JeffWang987/DriveDreamer"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-github"></i>
</span>
<span>Code</span>
</a>
</span>
</div>
</div>
</div>
</div>
</div>
</div>
</section>
<section class="hero teaser">
<div class="container is-max-desktop">
<div class="hero-body">
<img id="teaser" autoplay muted loop playsinline height="100%" src="./static/images/abs.png" style="width:100%;height:100%;">
<p style="font-size: 16px;">
DriveDreamer excels at controllable driving video generation, aligning seamlessly with text prompts and structured traffic constraints. It can also interact with the driving scene, predicting different future driving videos based on input driving actions. Furthermore, DriveDreamer extends its utility to anticipating future driving actions.
</p>
</div>
</div>
</section>
<section class="section">
<div class="container is-max-desktop">
<!-- Abstract. -->
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Abstract</h2>
<div class="content has-text-justified">
<p>
World models, especially in autonomous driving, are trending and drawing extensive attention due to their
capacity for comprehending driving environments. An established world model holds immense potential for
generating high-quality driving videos and for deriving driving policies for safe maneuvering. However,
a critical limitation of relevant research lies in its predominant focus on gaming environments or simulated
settings, which lack representations of real-world driving scenarios. We therefore introduce
DriveDreamer, a pioneering world model derived entirely from real-world driving scenarios. Because
modeling the world in intricate driving scenes entails an overwhelming search space, we propose harnessing
a powerful diffusion model to construct a comprehensive representation of the complex environment. Furthermore,
we introduce a two-stage training pipeline: in the initial phase, DriveDreamer acquires a deep understanding of
structured traffic constraints, while the subsequent stage equips it with the ability to anticipate future states.
DriveDreamer is thus the first world model established from real-world driving scenarios. We instantiate
DriveDreamer on the challenging nuScenes benchmark, and extensive experiments verify that DriveDreamer enables precise,
controllable video generation that faithfully captures the structural constraints of real-world traffic scenarios.
Additionally, DriveDreamer generates realistic and reasonable driving policies, opening avenues for
interaction and practical applications.
</p>
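<p>
To make the two-stage pipeline concrete, the following is a minimal training sketch in PyTorch. Every module, shape, and loss here is a hypothetical placeholder rather than the released DriveDreamer code: both stages share a diffusion-style denoising objective, with the first stage fitting single frames under structured traffic conditions and the second fitting frame sequences so the model learns to anticipate future states.
</p>
<pre><code># Hypothetical sketch of the two-stage training schedule described above.
# Module names, shapes, and the loss are illustrative placeholders,
# not the released DriveDreamer implementation.
import torch
import torch.nn as nn

class AutoDM(nn.Module):
    """Stand-in for the conditional diffusion backbone."""
    def __init__(self, dim=64):
        super().__init__()
        self.net = nn.Linear(dim * 2, dim)

    def forward(self, noisy_latent, structure_cond):
        # Predict the noise added to the video latent, conditioned on
        # structured traffic inputs (HD map + 3D boxes).
        return self.net(torch.cat([noisy_latent, structure_cond], dim=-1))

def train_stage(model, batches, optimizer):
    for noisy_latent, structure_cond, noise in batches:
        pred = model(noisy_latent, structure_cond)
        loss = nn.functional.mse_loss(pred, noise)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

model = AutoDM()
opt = torch.optim.AdamW(model.parameters(), lr=1e-4)
dummy = [(torch.randn(2, 64), torch.randn(2, 64), torch.randn(2, 64))]
train_stage(model, dummy, opt)  # stage 1: learn traffic-structure constraints
train_stage(model, dummy, opt)  # stage 2: same objective over frame sequences
</code></pre>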
</div>
</div>
</div>
<!--/ Abstract. -->
</div>
</section>
<section class="section" id="Method">
<div class="container is-max-desktop content">
<h2 class="title">Method</h2>
<section class="hero method">
<div class="container is-max-desktop">
<div class="hero-body">
<img id="method" autoplay muted loop playsinline height="100%" src="./static/images/method.png" style="width:100%;height:100%;">
<p>
The DriveDreamer framework begins with an initial reference frame and its corresponding road structural information (i.e., HD map and 3D boxes). Within this context, DriveDreamer leverages the proposed ActionFormer to predict forthcoming road structural features in the latent space. These predicted features serve as conditions for Auto-DM, which generates future driving videos. Simultaneously, text prompts allow dynamic adjustments to the driving scenario style (e.g., weather and time of day). Moreover, DriveDreamer incorporates historical action information and the multi-scale latent features extracted from Auto-DM, which are combined to generate reasonable future driving actions. In essence, DriveDreamer offers a comprehensive framework that seamlessly integrates multi-modal inputs to generate future driving videos and driving policies, thereby advancing the capabilities of autonomous-driving systems.
</p>
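<p>
The data flow above can be summarized in a short, hypothetical PyTorch sketch. The ActionFormer, Auto-DM, and action-prediction interfaces below are simplified stand-ins (a GRU and linear layers) chosen only to illustrate how the pieces connect; they are not the actual architecture or its API.
</p>
<pre><code># Illustrative end-to-end data flow of the framework in the figure above.
# All classes are simplified stand-ins, not the actual DriveDreamer modules.
import torch
import torch.nn as nn

class ActionFormer(nn.Module):
    """Stand-in: predicts future road-structure features in latent space."""
    def __init__(self, dim=64):
        super().__init__()
        self.gru = nn.GRU(dim, dim, batch_first=True)

    def forward(self, structure_feat, action_seq):
        # Seed the recurrence with the reference-frame structure feature
        # (HD map + 3D boxes) and roll it forward over planned actions.
        h0 = structure_feat.unsqueeze(0)       # (1, B, D)
        out, _ = self.gru(action_seq, h0)      # (B, T, D)
        return out

class AutoDM(nn.Module):
    """Stand-in: turns structure features plus a text prompt into video latents."""
    def __init__(self, dim=64):
        super().__init__()
        self.net = nn.Linear(dim * 2, dim)

    def forward(self, structure_feats, text_emb):
        # Broadcast the text-prompt embedding (weather, time of day) over
        # every future step before decoding video latents.
        cond = torch.cat(
            [structure_feats, text_emb.unsqueeze(1).expand_as(structure_feats)],
            dim=-1)
        return self.net(cond)                  # (B, T, D)

class ActionHead(nn.Module):
    """Stand-in: maps video latents plus action history to future actions."""
    def __init__(self, dim=64, action_dim=2):
        super().__init__()
        self.mlp = nn.Linear(dim * 2, action_dim)

    def forward(self, video_latents, history):
        return self.mlp(torch.cat([video_latents, history], dim=-1))

B, T, D = 1, 8, 64
structure = torch.randn(B, D)     # reference HD map + 3D-box feature
actions   = torch.randn(B, T, D)  # embedded driving actions
text      = torch.randn(B, D)     # text-prompt embedding (e.g., "rainy")
history   = torch.randn(B, T, D)  # historical action features

feats  = ActionFormer()(structure, actions)   # future structure features
video  = AutoDM()(feats, text)                # future driving-video latents
future = ActionHead()(video, history)         # future driving actions (B, T, 2)
</code></pre>
<p>
Note that in this reading the video-generation branch and the action branch consume the same Auto-DM features, which is what lets one model serve both as a controllable video generator and as a source of driving policies.
</p>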
</div>
</div>
</section>
</div>
</section>
<section class="section" id="Results">
<div class="container is-max-desktop content">
<h2 class="title">Results</h2>
<section class="hero method">
<div class="container is-max-desktop">
<div class="hero-body">
<h4 class="title">1. Diverse Driving Video Generation.</h4>
<div class="container is-max-desktop">
<video preload="auto"poster="" id="tree" autoplay controls muted loop width="12000px" outline="0px">
<source src="./static/videos/diverse.mp4"
type="video/mp4">
</video>
</div>
<br>
<p style="margin-bottom: 30px;"></p>
<h4 class="title">2. Driving Video Generation with Traffic Condition and Different Text Prompts (Sunny, Rainy, Night)</h4>
<div class="container is-max-desktop">
<div style="text-align: center;">
<video preload="auto" poster="" id="tree" autoplay controls muted loop width="1200px" outline="0px" style="display: inline-block;">
<source src="./static/videos/text_driven.mp4" type="video/mp4">
</video>
</div>
</div>
<br>
<p style="margin-bottom: 30px;"></p>
<h4 class="title">3. Future Driving Video Generation with Action Interaction.</h4>
<div class="container is-max-desktop">
<div style="text-align: center;">
<video preload="auto" poster="" id="tree" autoplay controls muted loop width="1000px" outline="0px" style="display: inline-block;">
<source src="./static/videos/action_interact_concat.mp4" type="video/mp4">
</video>
</div>
</div>
<br>
<p style="margin-bottom: 30px;"></p>
<h4 class="title">4. Future Driving Action Generation.</h4>
<div class="container is-max-desktop">
<div style="text-align: center;">
<video preload="auto" poster="" id="tree" autoplay controls muted loop width="1000px" outline="0px" style="display: inline-block;">
<source src="./static/videos/action_pred_concat.mp4" type="video/mp4">
</video>
</div>
</div>
</div>
</div>
</section>
</div>
</section>
<section class="section" id="BibTeX">
<div class="container is-max-desktop content">
<h2 class="title">BibTeX</h2>
<p> If you use our work in your research, please cite: </p>
<pre><code>@article{wang2023drive,
title={DriveDreamer: Towards Real-world-driven World Models for Autonomous Driving},
author={Wang, Xiaofeng and Zhu, Zheng and Huang, Guan and Chen, Xinze and Zhu, Jiagang and Lu, Jiwen},
journal={arXiv preprint arXiv:2309.09777},
year={2023}
}</code></pre>
</div>
</section>
<footer class="footer">
<div class="container">
<div class="content has-text-centered">
<a class="icon-link" href="https://github.com/JeffWang987/DriveDreamer" class="external-link" disabled>
<i class="fab fa-github"></i>
</a>
</div>
<div class="columns is-centered">
<div class="column is-8">
<div class="content">
<p>
Website adapted from the following <a href="https://github.com/nerfies/nerfies.github.io">template</a>.
</p>
</div>
</div>
</div>
</div>
</footer>
</body>
</html>