
Commit 1244eee
Commit message: update
Parent: a246224

File tree: 3 files changed (+16, −0 lines)

_bibliography/papers.bib — 16 additions, 0 deletions
@@ -1,5 +1,21 @@
 ---
 ---
+@inproceedings{Chen2024MEGABenchSM,
+  title = "MEGA-Bench: Scaling Multimodal Evaluation to over 500 Real-World Tasks",
+  author = {Jiacheng Chen and Tianhao Liang and Sherman Siu and Zhengqing Wang and Kai Wang and Yubo Wang and Yuansheng Ni and Wang Zhu and Ziyan Jiang and Bohan Lyu and Dongfu Jiang and Xuan He and Yuan-Ting Liu and Hexiang Hu and Xiang Yue and Wenhu Chen},
+  booktitle = "arxiv preprint",
+  arxiv = "2410.10563",
+  abstract = "We present MEGA-Bench, an evaluation suite that scales multimodal evaluation to over 500 real-world tasks, to address the highly heterogeneous daily use cases of end users. Our objective is to optimize for a set of high-quality data samples that cover a highly diverse and rich set of multimodal tasks, while enabling cost-effective and accurate model evaluation. In particular, we collected 505 realistic tasks encompassing over 8,000 samples from 16 expert annotators to extensively cover the multimodal task space. Instead of unifying these problems into standard multi-choice questions (like MMMU, MMBench, and MMT-Bench), we embrace a wide range of output formats like numbers, phrases, code, \LaTeX, coordinates, JSON, free-form, etc. To accommodate these formats, we developed over 40 metrics to evaluate these tasks. Unlike existing benchmarks, MEGA-Bench offers a fine-grained capability report across multiple dimensions (e.g., application, input type, output format, skill), allowing users to interact with and visualize model capabilities in depth. We evaluate a wide variety of frontier vision-language models on MEGA-Bench to understand their capabilities across these dimensions.",
+  preview = {megabench_preview.png},
+  website = "https://tiger-ai-lab.github.io/MEGA-Bench/",
+  github = "TIGER-AI-Lab/MEGA-Bench",
+  huggingface = "https://huggingface.co/spaces/TIGER-Lab/MEGA-Bench",
+  twitter = "https://x.com/WenhuChen/status/1846692920117678384",
+  selected = false,
+  num_co_first_author = 3,
+  abbr = {Arxiv},
+  bibtex_show = {true},
+}
 
 @inproceedings{he2024videoscore,
   title = "VideoScore: Building Automatic Metrics to Simulate Fine-grained Human Feedback for Video Generation",
1.92 MB binary file (inline preview not shown)

assets/pdf/Dongfu_CV.pdf — 390 Bytes
Binary file not shown.

0 commit comments