Commit fe297e6

update
1 parent cea1d98

File tree

1 file changed: +4 -6 lines changed

_bibliography/papers.bib

Lines changed: 4 additions & 6 deletions
@@ -62,21 +62,19 @@ @inproceedings{Jiang2024GenAIAA
   num_co_first_author = {3},
   bibtex_show={true},
 }
-
-@article{jiang2024mantis,
+@article{Jiang2024MANTISIM,
   title={MANTIS: Interleaved Multi-Image Instruction Tuning},
-  author={Dongfu Jiang and Xuan He and Huaye Zeng and Cong Wei and Max Ku and Qian Liu and Wenhu Chen},
+  author={Dongfu Jiang and Xuan He and Huaye Zeng and Cong Wei and Max W.F. Ku and Qian Liu and Wenhu Chen},
   abstract={Large multimodal models (LMMs) have shown great results in single-image vision language tasks. However, their abilities to solve multi-image visual language tasks is yet to be improved. The existing LMMs like OpenFlamingo, Emu2, Idefics gain their multi-image ability through pre-training on hundreds of millions of noisy interleaved image-text data from the web, which is neither efficient nor effective. In this paper, we aim to build strong multi-image LMMs via instruction tuning with academic-level resources. Therefore, we meticulously construct Mantis-Instruct containing 721K multi-image instruction data to train a family of models Mantis. The instruction tuning empowers Mantis with different multi-image skills like co-reference, comparison, reasoning, and temporal understanding. We evaluate Mantis on five multi-image benchmarks and seven single-image benchmarks. Mantis-SigLIP can achieve SoTA results on all the multi-image benchmarks and beat the strongest multi-image baseline, Idefics2-8B by an average of 11 absolute points. Notably, Idefics2-8B was pre-trained on 140M interleaved multi-image data, which is 200x larger than Mantis-Instruct. We observe that Mantis performs equivalently well on the held-in and held-out benchmarks, which shows its generalization ability. Notably, we found that Mantis can even match the performance of GPT-4V on multi-image benchmarks. We further evaluate Mantis on single-image benchmarks and demonstrate that Mantis also maintains a strong single-image performance on par with CogVLM and Emu2. Our results show that multi-image abilities are not necessarily gained through massive pre-training, instead, it can be gained by the low-cost instruction tuning. Our work provides new perspectives on how to improve LMMs' multi-image abilities.},
-  journal={ArXiv},
-  booltitle={ArXiv},
+  journal={Transactions on Machine Learning Research},
   year={2024},
   eprint={2405.01483},
   archivePrefix={arXiv},
   primaryClass={cs.CV},
   url={https://arxiv.org/abs/2405.01483},
   website = "https://tiger-ai-lab.github.io/Mantis/",
   github = "TIGER-AI-Lab/Mantis",
-  abbr={arXiv},
+  abbr={TMLR 2024},
   preview={mantis_preview.png},
   arxiv={2405.01483},
   twitter = "https://twitter.com/DongfuJiang/status/1786552974598078677",
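For reference, this is how the MANTIS entry reads after the hunk is applied, reconstructed directly from the context and + lines above. The abstract is elided to {...} here for readability, the closing brace falls outside the hunk and is assumed, and the non-standard fields (abbr, website, github, preview, arxiv, twitter) appear to be al-folio/Jekyll theme extensions rather than standard BibTeX:

@article{Jiang2024MANTISIM,
  title={MANTIS: Interleaved Multi-Image Instruction Tuning},
  author={Dongfu Jiang and Xuan He and Huaye Zeng and Cong Wei and Max W.F. Ku and Qian Liu and Wenhu Chen},
  abstract={...},
  journal={Transactions on Machine Learning Research},
  year={2024},
  eprint={2405.01483},
  archivePrefix={arXiv},
  primaryClass={cs.CV},
  url={https://arxiv.org/abs/2405.01483},
  website = "https://tiger-ai-lab.github.io/Mantis/",
  github = "TIGER-AI-Lab/Mantis",
  abbr={TMLR 2024},
  preview={mantis_preview.png},
  arxiv={2405.01483},
  twitter = "https://twitter.com/DongfuJiang/status/1786552974598078677",
}

The substantive change is the venue update from a preliminary arXiv record (journal={ArXiv} plus the misspelled booltitle field) to the TMLR publication, with the matching abbr badge, a new citation key, and the corrected author name "Max W.F. Ku".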
