<!-- index.html -->

<!DOCTYPE html>
<html lang="en">
<head>
  <!-- Encoding and viewport are declared first so the parser knows the
       charset before it reaches any text (title, inline CSS, scripts). -->
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">

  <!-- A document may contain only one <title>; this is the descriptive one. -->
  <title>VidEgoThink: Assessing Egocentric Video Understanding Capabilities for Embodied AI</title>
  <meta name="description"
        content="Assessing Egocentric Video Understanding Capabilities for Embodied AI">
  <meta name="keywords" content="VidEgoThink, Egocentric, Embodied AI, Vision Language Model, First-Person, Benchmark, Artificial Intelligence, AI, AGI">

  <!-- Utility class for hiding elements. Declared ahead of the linked
       stylesheets, so any later rule of equal specificity still wins. -->
  <style>
    .hidden {
      display: none;
    }
  </style>
  <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>

  <link rel="icon" href="./static/images/VidEgoThink_icon.png">

  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet">

  <!-- Stylesheets: Bulma core and plugins, icon fonts, then page-specific CSS. -->
  <link rel="stylesheet" href="./static/css/bulma.min.css">
  <link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="./static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="./static/css/index.css">
  <link rel="stylesheet" href="./static/css/leaderboard.css">

  <!-- <link href="https://unpkg.com/tabulator-tables@5.5.2/dist/css/tabulator_bulma.min.css" rel="stylesheet">
  <script type="text/javascript" src="https://unpkg.com/tabulator-tables@5.5.2/dist/js/tabulator.min.js"></script> -->
  <script type="text/javascript" src="static/js/sort-table.js" defer></script>

  <!-- Blocking scripts run in document order; jQuery is loaded before the
       Bulma plugins and page scripts (which presumably rely on it; keep
       this order unless that is confirmed otherwise). -->
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script defer src="./static/js/fontawesome.all.min.js"></script>
  <script src="./static/js/bulma-carousel.min.js"></script>
  <script src="./static/js/bulma-slider.min.js"></script>
  <script src="./static/js/index.js"></script>
  <script src="./static/js/question_card.js"></script>
  <!-- Deferred data files: executed after parsing, in the order listed. -->
  <script src="./data/results/data_setting.js" defer></script>
  <script src="./data/results/model_scores.js" defer></script>
  <script src="./visualizer/data/data_public.js" defer></script>
</head>
<body>

<!-- Top navigation bar built from Bulma navbar classes: a burger toggle and a
     centered "More Research" dropdown that links to related project pages. -->
<nav class="navbar" role="navigation" aria-label="main navigation">
  <div class="navbar-brand">
    <!-- Burger toggle. It carries no href or data-target, so any open/close
         behaviour must come from an external script (presumably
         static/js/index.js; confirm). -->
    <a role="button" class="navbar-burger" aria-label="menu" aria-expanded="false">
      <span aria-hidden="true"></span>
      <span aria-hidden="true"></span>
      <span aria-hidden="true"></span>
    </a>
  </div>
  <div class="navbar-menu">
    <!-- Inline flex styles center the dropdown within the menu. -->
    <div class="navbar-start" style="flex-grow: 1; justify-content: center;">
      <!-- Dropdown uses the is-hoverable class; no script hook is attached here. -->
      <div class="navbar-item has-dropdown is-hoverable">
        <a class="navbar-link">
          More Research
        </a>
        <div class="navbar-dropdown">
          <!-- <a class="navbar-item" href="https://github.com/imoneoi/openchat">
            <b>OpenChat</b> <p style="font-size:18px; display: inline; margin-left: 5px;">🔥</p>
          </a> -->
          <a class="navbar-item" href="https://adacheng.github.io/EgoThink/">
            <b>EgoThink</b>
          </a>
          <a class="navbar-item" href="https://zhichengg.github.io/stb.github.io/">
            <b>StableToolBench</b>
          </a>
        </div>
      </div>
    </div>

  </div>
</nav>


<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <h1 class="title is-1 publication-title is-bold">
            <img src="static/images/VidEgoThink_icon.png" style="width:1em;vertical-align: middle" alt="Logo"/> 
            <span class="mmmu" style="vertical-align: middle">VidEgoThink</span>
            </h1>
          <h2 class="subtitle is-3 publication-subtitle">
            Assessing Egocentric Video Understanding Capabilities for Embodied AI 
          </h2>
          <!-- Author list. Superscripts give the role marker (†, *, ✉) followed
               by the affiliation numbers defined in the next block. -->
          <div class="is-size-5 publication-authors">
            <span class="author-block">Sijie Cheng<sup style="color:#6fbf73;">†,1,2,6</sup>,</span>
            <span class="author-block">Kechen Fang*<sup style="color:#ffac33;">2,5</sup>,</span>
            <span class="author-block">Yangyang Yu*<sup style="color:#ed4b82;">2,5</sup>,</span>
            <span class="author-block">Sicheng Zhou*<sup style="color:#007bff;">2,3</sup>,</span><br>
            <span class="author-block">Bohao Li<sup style="color:#ed4b82;">4,6</sup>,</span>
            <span class="author-block">Ye Tian<sup style="color:#9b51e0;">6</sup>,</span>
            <span class="author-block">Tingguang Li<sup style="color:#9b51e0;">6</sup>,</span>
            <span class="author-block">Lei Han<sup style="color:#9b51e0;">✉,6</sup>,</span>
            <span class="author-block">Yang Liu<sup style="color:#ed4b82;">✉,1,2</sup></span>
          </div>

          <br>

          <!-- Affiliations, numbered to match the author superscripts.
               Affiliations 5 and 6 share a line, as in the original layout. -->
          <div class="is-size-5 publication-authors">
            <span class="author-block"><sup style="color:#6fbf73;">1</sup>Department of Computer Science and Technology, Tsinghua University</span><br>
            <span class="author-block"><sup style="color:#ffac33;">2</sup>Institute for AI Industry Research (AIR), Tsinghua University</span><br>
            <span class="author-block"><sup style="color:#ed4b82;">3</sup>Department of Mechanical and Industrial Engineering, University of Toronto</span><br>
            <span class="author-block"><sup style="color:#007bff;">4</sup>School of Data Science, The Chinese University of Hong Kong</span><br>
            <span class="author-block"><sup style="color:#ffac33;">5</sup>Zhili College, Tsinghua University</span>
            <span class="author-block"><sup style="color:#9b51e0;">6</sup>Tencent Robotics X</span><br>
          </div>

          <br>
          <!-- Legend for the role markers and the project lead's contact address. -->
          <div class="is-size-5 publication-authors">
            <span class="author-block">*Equal contribution, ✉Corresponding author</span><br>
            <span class="author-block">†Project Lead:</span>
            <span class="author-block"><a href="mailto:csj23@mails.tsinghua.edu.cn">csj23@mails.tsinghua.edu.cn</a></span>
          </div>
          

          <!-- Row of pill-shaped buttons linking to the paper, code and leaderboard. -->
          <div class="column has-text-centered">
            <div class="publication-links">
              <!-- arXiv PDF link. -->
              <span class="link-block">
                <a href="https://arxiv.org/pdf/2410.11623"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fas fa-file-pdf"></i>
                  </span>
                  <span>arXiv</span>
                </a>
              </span>
              <!-- Hugging Face paper page link. -->
              <span class="link-block">
                <a href="https://huggingface.co/papers/2410.11623"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <!-- Emoji icons use <span>: a <p> is not allowed inside a <span>.
                         They are decorative, so they are hidden from assistive tech. -->
                    <span style="font-size:18px" aria-hidden="true">🤗</span>
                  </span>
                  <span>HF Paper</span>
                </a>
              </span>
              <!-- TODO: re-enable the "Dataset" button once its link is updated
                   (the placeholder target was https://huggingface.co/datasets/). -->
              <!-- Code link. -->
              <span class="link-block">
                <a href="https://github.com/AdaCheng/VidEgoThink"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fab fa-github"></i>
                  </span>
                  <span>Code</span>
                </a>
              </span>
              <!-- Leaderboard link (in-page anchor to #leaderboard). -->
              <span class="link-block">
                <a href="#leaderboard"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <span style="font-size:18px" aria-hidden="true">🏆</span>
                  </span>
                  <span>Leaderboard</span>
                </a>
              </span>
            </div>

          </div>
        </div>
      </div>
    </div>
  </div>
</section>
<style>
  /* Horizontally centers an element: it is made block-level, given auto left
     and right margins, and sized to 80% of its container's width. Used on
     figure images via class="center". */
  .center {
    display: block;
    margin-left: auto;
    margin-right: auto;
    width: 80%;
  }
  </style>
<section class="hero teaser">
  <div class="container is-max-desktop">
    <!-- Figure 1: overview image of the benchmark tasks with its caption. -->
    <div class="content has-text-centered">
      <img src="static/images/intro.png" alt="Overview of the four VidEgoThink task types: video question answering, hierarchy planning, visual grounding, and reward modeling">
      <p><b><i>Figure 1:</i></b> The main tasks of VidEgoThink benchmark to comprehensively assess the egocentric video understanding capabilities in Embodied AI. There are four types of tasks, including video question answering, hierarchy planning, visual grounding, and reward modeling. These four tasks are complementary to each other to implement a complete goal for Embodied AI.</p>
    </div>
  </div>
</section>

<section class="section">
  <div class="container" style="margin-bottom: 2vh;">
    <!-- Abstract. -->
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">🔔News</h2>
        <div class="content has-text-justified">
          <p>
            <b>[2024-10]: <a href="https://huggingface.co/papers/2410.11623">VidEgoThink</a> is the Top-1 paper of Oct-17 in Hugging Face. 🔥</b><br>
            <b>[2024-10]: Our paper <a href="https://arxiv.org/abs/2410.11623">VidEgoThink: Assessing Egocentric Video Understanding Capabilities for Embodied AI</a> has been released.</b><br>
            <b>[2024-09]: EgoThink and VidEgoThink are invited to be presented at <a href="https://zhidx.com/p/441426.html">ZhiDX</a>.</b><br>
          </p>
      </div>      
        <h2 class="title is-3">Abstract</h2>
        <div class="content has-text-justified">
          <p>
            Recent advancements in Multi-modal Large Language Models (MLLMs) have opened new avenues for applications in Embodied AI. Building on previous work, EgoThink, we introduce VidEgoThink, a comprehensive benchmark for evaluating egocentric video understanding capabilities. To bridge the gap between MLLMs and low-level control in Embodied AI, we design four key interrelated tasks: video question-answering, hierarchy planning, visual grounding and reward modeling. To minimize manual annotation costs, we develop an automatic data generation pipeline based on the Ego4D dataset, leveraging the prior knowledge and multimodal capabilities of GPT-4o. Three human annotators then filter the generated data to ensure diversity and quality, resulting in the VidEgoThink benchmark. We conduct extensive experiments with three types of models: API-based MLLMs, open-source image-based MLLMs, and open-source video-based MLLMs. Experimental results indicate that all MLLMs, including GPT-4o, perform poorly across all tasks related to egocentric video understanding. These findings suggest that foundation models still require significant advancements to be effectively applied to first-person scenarios in Embodied AI. In conclusion, VidEgoThink reflects a research trend towards employing MLLMs for egocentric vision, akin to human capabilities, enabling active observation and interaction in the complex real-world environments.
          </p>
        </div>
      </div>
    </div>
    <!--/ Abstract. -->
</div>
</section>

<!-- BACKGROUND SECTION -->
<section class="hero is-light is-small">
  <div class="hero-body has-text-centered">
  <h1 class="title is-1 mmmu">
    <!-- <img src="static/images/VidEgoThink_icon.png" style="width:1em;vertical-align: middle" alt="Logo"/> -->
    <span class="mmmu" style="vertical-align: middle">Background</span>
  </h1>
  </div>
</section>

<section class="section">
  <div class="container">
    <div class="columns is-centered has-text-centered">
      <!-- <div class="column is-full-width has-text-centered"> -->
      <div class="column is-four-fifths">
        <!-- <h2 class="title is-3">Comparisons with Existing Benchmarks</h2> -->
        <div class="content has-text-justified">
          <p>
            Various egocentric benchmarks have emerged to evaluate the capabilities of MLLMs from a first-person perspective. However, the absence of a <b>comprehensive</b> video benchmark from the egocentric perspective presents a significant challenge to the development of general foundation models. Furthermore, current benchmarks, both in task design and textual output forms, focus on traditional video question-answering settings and neglect the potential to support downstream applications in Embodied AI, such as glass devices or autonomous robots. Therefore, it is crucial to design <b>suitable task formats that can be effectively applied to downstream applications in Embodied AI</b>.
        </p>
        <div class="content has-text-centered">
          <img src="static/images/comparison.png" alt="Table comparing VidEgoThink with recent evaluation benchmarks for multimodal large language models" class="center">
          <p><b><i>Table 1:</i></b> Comparison of recent evaluation benchmarks of multimodal large language models and our proposed benchmark VidEgoThink. VQA/HP/VG/RM indicate visual question answering, hierarchy planning, visual grounding, and reward modeling. Existing/Handcraft/Automatic denote the way of collecting data, including existing dataset, manual annotation, and automatic generation.</p>
        </div>
        </div>
    </div>
    </div>
  </div>
</section>

<section class="hero is-light is-small">
  <div class="hero-body has-text-centered">
  <h1 class="title is-1 mmmu">
    <img src="static/images/VidEgoThink_icon.png" style="width:1em;vertical-align: middle" alt="Logo"/>
    <span class="mmmu" style="vertical-align: middle">VidEgoThink Benchmark</span>
  </h1>
  </div>
</section>

            
<section class="section">
  <div class="container">

    <div class="columns is-centered has-text-centered">
      <!-- <div class="column is-full-width has-text-centered"> -->
      <div class="column is-four-fifths">
        <div class="content has-text-justified">
          <p>
            Given that the utilization of foundation models in Embodied AI remains an open research question, we carefully design four types of interrelated tasks for comprehensive assessment: (i) video question-answering, (ii) hierarchy planning, (iii) visual grounding, (iv) reward modeling.
          </p>
        </div>
      </div>
    </div>

    <div class="columns is-centered has-text-centered">
      <!-- <div class="column is-full-width has-text-centered"> -->
      <div class="column is-four-fifths">
        <h2 class="title is-3">1. Video Question Answering</h2>
        <div class="content has-text-justified">
          <p>
            Previous evaluation studies on egocentric vision have predominantly focused on static images, constrained by the input format limitations of earlier MLLMs. However, recent advancements in API-based and video-based MLLMs have demonstrated significant progress. Since our real world is inherently dynamic and humans frequently process substantial amounts of video data, it is crucial to evaluate the video understanding capabilities of MLLMs. Considering the essential abilities for observing and interacting with the real world from a first-person perspective, we decompose the content of video modalities around “myself” into three main elements: object, action, and scene. Furthermore, we explore a series of fine-grained dimensions from these elements.
          </p>
          <img src="static/images/vqa.png" alt="Example case of video question answering" class="center">
          <div class="content has-text-centered">
            <p><b><i>Figure 2:</i></b> Case of video question answering.</p>
          </div>
        </div>
      </div>
    </div>

    <div class="columns is-centered has-text-centered">
      <!-- <div class="column is-full-width has-text-centered"> -->
      <div class="column is-four-fifths">
        <h2 class="title is-3">2. Hierarchy Planning</h2>
        <div class="content has-text-justified">
          <p>
            Recently, a hierarchy planning framework has been proposed to combine the advantages of foundation models and traditional methods in Embodied AI. In detail, foundation models are used as the planner to decompose high-level task instructions (e.g., “cook salmon”) into either mid-level steps (e.g., “# put salmon in the microwave”) or low-level atomic actions (e.g., “find(microwave)”), which is much more convenient for controlling. Therefore, we design two types of planning tasks: high-level goal to mid-level step (High-to-Mid), and mid-level step to low-level action (Mid-to-Low).
          </p>
          <img src="static/images/hp.png" alt="Example case of hierarchy planning" class="center">
          <div class="content has-text-centered">
            <p><b><i>Figure 3:</i></b> Case of hierarchy planning.</p>
          </div>
        </div>
      </div>
    </div>

    <div class="columns is-centered has-text-centered">
      <!-- <div class="column is-full-width has-text-centered"> -->
      <div class="column is-four-fifths">
        <h2 class="title is-3">3. Visual Grounding</h2>
        <div class="content has-text-justified">
          <p>
            While natural language is effective for human communication, it cannot be directly translated into low-level actions or grounded in the real world. Consequently, visual grounding has garnered significant attention in both image- and video-based MLLMs. This task requires models to ground complex natural language descriptions or instructions in an image or video and output the corresponding pixel-level bounding boxes, masks, or frames. The bounding boxes and masks can directly identify actionable objects, while the frames can provide sufficient spatial or temporal information for downstream tasks. Therefore, we specifically design three tasks for different situations: object grounding, frame grounding, and temporal grounding.
          </p>
          <img src="static/images/vg.png" alt="Example case of visual grounding" class="center">
          <div class="content has-text-centered">
            <p><b><i>Figure 4:</i></b> Case of visual grounding.</p>
          </div>
        </div>
      </div>
    </div>

    <div class="columns is-centered has-text-centered">
      <!-- <div class="column is-full-width has-text-centered"> -->
      <div class="column is-four-fifths">
        <h2 class="title is-3">4. Reward Modeling</h2>
        <div class="content has-text-justified">
          <p>
            In Embodied AI, manually designing reward functions to supervise actions is challenging due to the need for accuracy and diversity, especially for human activities. Benefiting from the large-scale Internet training corpus, foundation models can serve as reward models with built-in commonsense and reasoning capabilities. As a reward model, MLLMs should first observe the video to determine the completion status of the target motion. If the action is not completed, the reward model should further provide fine-grained feedback to help achieve the goal. Hence, we specifically design two types of tasks: critique, and feedback.
          </p>
          <img src="static/images/rm.png" alt="Example case of reward modeling" class="center">
          <div class="content has-text-centered">
            <p><b><i>Figure 5:</i></b> Case of reward modeling.</p>
          </div>
        </div>
      </div>
    </div>

    <div class="columns is-centered m-6">
      <div class="column is-full has-text-centered content">
        <h2 class="title is-3">Statistics</h2>
        <!-- <div id="results-carousel" class="carousel results-carousel"> -->
        <div class="content has-text-centered">
          <img src="static/images/statistics.png" alt="Table of video statistics across benchmarks" width="75%"/>
          <p><b><i>Table 2:</i></b> The statistics of videos across different benchmarks. Duration denotes the average duration in seconds of all videos. LenQ and LenA indicate the average length of questions and answers at the word level. TypeQ denotes the type of questions.</p>
        </div>
      </div>
    </div>
  </div>
</section>

<!-- RESULTS SECTION -->
<section class="hero is-light is-small">
  <div class="hero-body has-text-centered">
    <h1 class="title is-1 mmmu" id="leaderboard">Leaderboard</h1>
  </div>
</section>


<!-------------------------------------------------------------------- RESULTS SECTION -------------------------------------------------------------------->
<section class="section">
  <div class="container">
    <!-- <div class="columns is-centered has-text-centered"> -->
      <!-- <div class="column is-full-width has-text-centered"> -->
      <!-- <div class="column is-four-fifths"> -->
        <!-- <h2 class="title is-3">Vision-Language Models</h2> -->
        <!-- <div class="content has-text-centered"> -->
          <!-- <img src="static/images/vlms.png" alt="algebraic reasoning" class="center"> -->
          <!-- <p><b><i>Table 3:</i></b> Statistics of compared API-based and open-source VLMs, where TTP and ToP indicate Total Trainable Parameters and Total Parameters, respectively. Moreover, EgoData and Video indicate that there are egocentric visual data and video data for training, respectively.</p> -->
        <!-- </div> -->
      <!-- </div> -->
    <!-- </div> -->
  
    <div class="columns is-centered has-text-centered">
      <!-- <div class="column is-full-width has-text-centered"> -->
      <div class="column is-four-fifths">
        <h2 class="title is-3">Video Question Answering</h2>
        <div class="content has-text-justified">
          <img src="static/images/result_vqa.png" alt="Table of experimental results on video question answering" class="center">
          <div class="content has-text-centered">
            <p><b><i>Table 3:</i></b> Experimental results of video question answering. OE, OO, OI, OC, OS, OP denote object existence, object order, object interaction, object count, object state, object prediction. AE, AS, AC indicate action existence, action sequence, action count. SE, ST, SP denote scene existence, scene transition, scene prediction. The bold font denotes the best performance and the underline font denotes the second-best performance.</p>
          </div>
        </div>
      </div>
    </div>

    <div class="columns is-centered has-text-centered">
      <!-- <div class="column is-full-width has-text-centered"> -->
      <div class="column is-four-fifths">
        <h2 class="title is-3">All Tasks</h2>
        <div class="content has-text-justified">
          <img src="static/images/result_all.png" alt="Table of experimental results across all four tasks" class="center">
          <div class="content has-text-centered">
            <p><b><i>Table 4:</i></b> Experimental results of video question answering, hierarchy planning, visual grounding, and reward modeling tasks. The bold font denotes the best performance and the underline font denotes the second-best performance.</p>
          </div>
        </div>
      </div>
    </div>

    <!-- <div class="columns is-centered m-6"> -->
      <!-- <div class="column is-full has-text-centered content"> -->
        <!-- <h2 class="title is-3" id="leaderboard">Leaderboard</h2> -->
        <!-- <div class="content"> -->
          <!-- <div class="content has-text-justified"> -->
            <!-- <p> -->
              <!-- Evaluating open-ended model generations is a non-trivial problem. To address this, we propose using GPT-4 as an automatic evaluator to better measure the generated answers. We continuously update the results of recent VLMs to ensure the effectiveness of EgoThink. Feel free to contribute to the performance of your model by adding it to our <a href="https://github.com/AdaCheng/EgoThink/blob/main/index.html">index.html</a>; we will review and merge it accordingly. -->
            <!-- </p> -->
          <!-- </div> -->

          <!-- <button id="toggleButton" onclick="changeButtonText()">Switch to Validation Set</button> -->
          <!-- <table id="table1" class="js-sort-table">
            <tr>
              <td class="js-sort-number"><strong>Reset</strong></td>
              <td class="js-sort-number"><strong>Average</strong></td>
              <td class="js-sort-number"><strong>Exist</strong></td>
              <td class="js-sort-number"><strong>Attr</strong></td>
              <td class="js-sort-number"><strong>Afford</strong></td>
              <td class="js-sort-number"><strong>Activity</strong></td>
              <td class="js-sort-number"><strong>Loc</strong></td>
              <td class="js-sort-number"><strong>Spatial</strong></td>
              <td class="js-sort-number"><strong>Count</strong></td>
              <td class="js-sort-number"><strong>Compar</strong></td>
              <td class="js-sort-number"><strong>Situtaed</strong></td>
              <td class="js-sort-number"><strong>Forecasting</strong></td>
              <td class="js-sort-number"><strong>Nav</strong></td>
              <td class="js-sort-number"><strong>Assist</strong></td>
            </tr>
            <tr style="background-color: #f8fffe;">
              <td style="text-align: left;"><b>GPT-4V(ision)</b></td>
              <td><b>65.5</b></td>
              <td>62.0</td>
              <td><b>82.0</b></td>
              <td><b>58.0</b></td>
              <td><b>59.5</b></td>
              <td style="text-decoration: underline;">86.0</td>
              <td style="text-decoration: underline;">62.0</td>
              <td><b>42.0</b></td>
              <td>48.0</td>
              <td><b>83.0</b></td>
              <td><b>55.0</b></td>
              <td><b>64.0</b></td>
              <td><b>84.0</b></td>
            </tr>  
            <tr style="background-color: #f9f2f8;">
              <td style="text-align: left;"><b>OpenFlamingo-7B</b></td>
              <td>27.2</td>
              <td>16.0</td>
              <td>55.0</td>
              <td>37.0</td>
              <td>15.0</td>
              <td>34.0</td>
              <td>34.0</td>
              <td>21.0</td>
              <td>40.0</td>
              <td>21.0</td>
              <td>31.0</td>
              <td>11.0</td>
              <td>11.0</td>
            </tr>
            <tr style="background-color: #f9f2f8;">
              <td style="text-align: left;"><b>BLIP-2-6.7B</b></td>
              <td>28.1</td>
              <td>49.0</td>
              <td>29.0</td>
              <td>39.0</td>
              <td>33.5</td>
              <td>60.0</td>
              <td>31.0</td>
              <td>3.0</td>
              <td>21.0</td>
              <td>33.0</td>
              <td>25.0</td>
              <td>8.0</td>
              <td>6.0</td>
            </tr>
            <tr style="background-color: #f9f2f8;">
              <td style="text-align: left;"><b>LLaVA-1.5-7B</b></td>
              <td>39.0</td>
              <td>33.0</td>
              <td>47.0</td>
              <td style="text-decoration: underline;">54.0</td>
              <td>35.5</td>
              <td>35.0</td>
              <td>49.0</td>
              <td>20.0</td>
              <td>47.0</td>
              <td>37.0</td>
              <td>27.0</td>
              <td>29.0</td>
              <td>54.0</td>
            </tr>
            <tr style="background-color: #f9f2f8;">
              <td style="text-align: left;"><b>MiniGPT-4-7B</b></td>
              <td>40.6</td>
              <td>50.0</td>
              <td>56.0</td>
              <td>46.0</td>
              <td>39.0</td>
              <td>55.0</td>
              <td>49.0</td>
              <td>14.0</td>
              <td>48.0</td>
              <td>31.0</td>
              <td>41.5</td>
              <td>14.0</td>
              <td>44.0</td>
            </tr>
            <tr style="background-color: #f9f2f8;">
              <td style="text-align: left;"><b>InstructBLIP-7B</b></td>
              <td>42.4</td>
              <td>50.0</td>
              <td>33.0</td>
              <td>45.0</td>
              <td>47.5</td>
              <td>77.0</td>
              <td>38.0</td>
              <td>18.0</td>
              <td>43.0</td>
              <td>67.0</td>
              <td>40.5</td>
              <td>19.0</td>
              <td>31.0</td>
            </tr>
            <tr style="background-color: #f9f2f8;">
              <td style="text-align: left;"><b>LLaMA-Adapter-7B</b></td>
              <td>42.5</td>
              <td>37.0</td>
              <td>60.0</td>
              <td>46.0</td>
              <td>34.5</td>
              <td>48.0</td>
              <td>51.0</td>
              <td>29.0</td>
              <td>39.0</td>
              <td>25.0</td>
              <td>41.5</td>
              <td>42.0</td>
              <td>57.0</td>
            </tr>
            <tr style="background-color: #f9f2f8;">
              <td style="text-align: left;"><b>Otter-I-7B</b></td>
              <td>45.3</td>
              <td>48.0</td>
              <td>56.0</td>
              <td>39.0</td>
              <td>44.0</td>
              <td>60.0</td>
              <td>44.0</td>
              <td>39.0</td>
              <td>48.0</td>
              <td>42.0</td>
              <td>38.0</td>
              <td>31.0</td>
              <td>55.0</td>
            </tr>
            <tr style="background-color: #f9f2f8;">
              <td style="text-align: left;"><b>PandaGPT-7B</b></td>
              <td>46.2</td>
              <td>40.0</td>
              <td>56.0</td>
              <td>41.0</td>
              <td>37.0</td>
              <td>61.0</td>
              <td>52.0</td>
              <td>19.0</td>
              <td style="text-decoration: underline;">52.0</td>
              <td>53.0</td>
              <td>43.0</td>
              <td>39.0</td>
              <td>61.0</td>
            </tr>
            <tr style="background-color: #f9f2f8;">
              <td style="text-align: left;"><b>mPLUG-owl-7B</b></td>
              <td>48.8</td>
              <td>56.0</td>
              <td>58.0</td>
              <td>47.0</td>
              <td>53.0</td>
              <td>60.0</td>
              <td>53.0</td>
              <td>25.0</td>
              <td>49.0</td>
              <td>44.0</td>
              <td>49.5</td>
              <td>33.0</td>
              <td>58.0</td>
            </tr>
            <tr style="background-color: #f9f2f8;">
              <td style="text-align: left;"><b>LLaVA-7B</b></td>
              <td>49.6</td>
              <td>63.0</td>
              <td>58.0</td>
              <td>50.0</td>
              <td>47.0</td>
              <td>81.0</td>
              <td>45.0</td>
              <td>24.0</td>
              <td>36.0</td>
              <td>47.0</td>
              <td>49.5</td>
              <td>35.0</td>
              <td>60.0</td>
            </tr>
            <tr style="background-color: #f4f9fe;">
              <td style="text-align: left;"><b>InstructBLIP-13B</b></td>
              <td>42.8</td>
              <td>52.0</td>
              <td>55.0</td>
              <td>49.0</td>
              <td>54.0</td>
              <td>63.0</td>
              <td>49.0</td>
              <td>11.0</td>
              <td>33.0</td>
              <td>59.0</td>
              <td>44.0</td>
              <td>19.0</td>
              <td>25.0</td>
            </tr>
            <tr style="background-color: #f4f9fe;">
              <td style="text-align: left;"><b>PandaGPT-13B</b></td>
              <td>43.1</td>
              <td>35.0</td>
              <td>52.0</td>
              <td>41.0</td>
              <td>40.5</td>
              <td>68.0</td>
              <td>31.0</td>
              <td>32.0</td>
              <td>40.0</td>
              <td>47.0</td>
              <td>45.5</td>
              <td>16.0</td>
              <td>69.0</td>
            </tr>
            <tr style="background-color: #f4f9fe;">
              <td style="text-align: left;"><b>LLaVA-13B-Vicuna</b></td>
              <td>46.4</td>
              <td>54.0</td>
              <td>62.0</td>
              <td>52.0</td>
              <td>46.0</td>
              <td>53.0</td>
              <td>46.0</td>
              <td>26.0</td>
              <td>44.0</td>
              <td>29.0</td>
              <td>44.0</td>
              <td>35.0</td>
              <td>66.0</td>
            </tr>       
            <tr style="background-color: #f4f9fe;">
              <td style="text-align: left;"><b>BLIP-2-11B</b></td>
              <td>49.6</td>
              <td>52.0</td>
              <td>62.0</td>
              <td>41.0</td>
              <td>49.5</td>
              <td><b>90.0</b></td>
              <td><b>66.0</b></td>
              <td>25.0</td>
              <td>50.0</td>
              <td>70.0</td>
              <td>48.0</td>
              <td>18.0</td>
              <td>24.0</td>
            </tr>       
            <tr style="background-color: #f4f9fe;">
              <td style="text-align: left;"><b>InstructBLIP-11B</b></td>
              <td>51.1</td>
              <td><b>74.0</b></td>
              <td style="text-decoration: underline;">68.0</td>
              <td>48.0</td>
              <td>49.5</td>
              <td style="text-decoration: underline;">86.0</td>
              <td>52.0</td>
              <td>32.0</td>
              <td>49.0</td>
              <td style="text-decoration: underline;">73.0</td>
              <td style="text-decoration: underline;">53.0</td>
              <td>16.0</td>
              <td>17.0</td>
            </tr>       
            <tr style="background-color: #f4f9fe;">
              <td style="text-align: left;"><b>LLaVA-13B-Llama2</b></td>
              <td>55.1</td>
              <td>65.0</td>
              <td>61.0</td>
              <td>45.0</td>
              <td style="text-decoration: underline;">56.0</td>
              <td>77.0</td>
              <td>53.0</td>
              <td style="text-decoration: underline;">34.0</td>
              <td>34.0</td>
              <td>66.0</td>
              <td>50.5</td>
              <td style="text-decoration: underline;">49.0</td>
              <td style="text-decoration: underline;">71.0</td>
            </tr> 
            <tr style="background-color: #f4f9fe;">
              <td style="text-align: left;"><b>LLaVA-1.5-13B</b></td>
              <td style="text-decoration: underline;">55.3</td>
              <td style="text-decoration: underline;">66.0</td>
              <td>55.0</td>
              <td>51.0</td>
              <td>55.0</td>
              <td>82.0</td>
              <td>57.0</td>
              <td>32.0</td>
              <td><b>56.0</b></td>
              <td>67.0</td>
              <td>48.5</td>
              <td>39.0</td>
              <td>55.0</td>
            </tr> 
          </table>

          <p> <b><i>Table 4:</i></b> Combined single-answer grading scores on zero-shot setups for various dimensions. The <b>bold</b> indicates the best performance while the <u>underline</u> indicates the second-best performance. Exist, Attr, Afford, Loc, Spatial, Count, Compar, Situated, Nav and Assist represent existence, attribute, affordance, location, spatial relationship, counting, comparison, situated reasoning, navigation, and assistance.</p> -->
      <!-- </div> -->
    <!-- </div>  -->
  </div>
</section>

<!-------------------------------------------------------------------- RESULTS SECTION -------------------------------------------------------------------->

<!-- @PAN TODO: bibtex -->
<section class="section" id="BibTeX">
  <div class="container is-max-desktop content">
    <h2 class="title is-3 has-text-centered">BibTeX</h2>
    <!--
      Citation entry for the paper. Whitespace inside <pre> is rendered
      verbatim, so the entry begins immediately after <code> and is kept
      flush-left; otherwise the page (and anything copied from it) shows a
      leading blank line and stray indentation.
    -->
    <pre><code>@article{cheng2024videgothink,
  title={VidEgoThink: Assessing Egocentric Video Understanding Capabilities for Embodied AI},
  author={Cheng, Sijie and Fang, Kechen and Yu, Yangyang and Zhou, Sicheng and Li, Bohao and Tian, Ye and Li, Tingguang and Han, Lei and Liu, Yang},
  journal={arXiv preprint arXiv:2410.11623},
  year={2024}
}</code></pre>
  </div>
</section>

<footer class="footer">
  <!-- Page footer: credits the site this page was adapted from and links its license. -->
  <div class="content has-text-centered">
  </div>
  <div class="columns is-centered">
    <div class="column is-8">
      <div class="content has-text-centered">
        <p>
          This website is adapted from
          <a href="https://adacheng.github.io/EgoThink/">EgoThink</a>,
          licensed under a
          <a rel="license" href="https://creativecommons.org/licenses/by-sa/4.0/">Creative
          Commons Attribution-ShareAlike 4.0 International License</a>.
        </p>
      </div>
    </div>
  </div>
</footer>


<style>
  /*
    Inline page styles: a visibility helper, sortable-column indicators,
    the #toggleButton look, and defaults applied to every <table>, <th>
    and <td> on the page.
  */

  /* Takes the element out of rendering entirely. */
  .hidden {
    display: none;
  }

  /* Pointer cursor signals that the element can be clicked.
     NOTE(review): presumably applied to sortable column headers by
     static/js/sort-table.js; confirm against that script. */
  .sortable:hover {
    cursor: pointer;
  }

  /* Arrow suffixes appended after elements carrying .asc / .desc. */
  .asc::after {
    content: ' ↑';
  }

  .desc::after {
    content: ' ↓';
  }

  /* White, fully rounded button with a light drop shadow. */
  #toggleButton {
    display: inline-block;
    margin: 4px 2px;
    padding: 10px 20px;
    border: 1px solid #ddd;
    border-radius: 25px;
    background-color: #fff;
    box-shadow: 0 4px 8px 0 rgba(0, 0, 0, 0.2);
    color: #555;
    font-size: 14px;
    text-align: center;
    text-decoration: none;
    cursor: pointer;
    transition-duration: 0.4s;
  }

  /* Deeper, two-layer shadow while the mouse hovers over the button. */
  #toggleButton:hover {
    box-shadow:
      0 12px 16px 0 rgba(0, 0, 0, 0.24),
      0 17px 50px 0 rgba(0, 0, 0, 0.19);
  }

  /* Full-width tables with collapsed borders and a thin outer frame. */
  table {
    width: 100%;
    margin-top: 5px;
    border: 1px solid #ddd;
    border-collapse: collapse;
    font-size: 14px;
  }

  /* Default cell alignment and spacing. */
  th,
  td {
    padding: 8px;
    text-align: left;
  }

  /* Header cells: grey fill with a heavier bottom rule. */
  th {
    border-bottom: 2px solid #ddd;
    background-color: #f2f2f2;
  }

  /* Hovered data cell turns white. */
  td:hover {
    background-color: #fff;
  }
</style>

</body>
</html>