Commit fe297e6

update
1 parent cea1d98

File tree

1 file changed: +4 -6 lines changed

_bibliography/papers.bib

Lines changed: 4 additions & 6 deletions
@@ -62,21 +62,19 @@ @inproceedings{Jiang2024GenAIAA
   num_co_first_author = {3},
   bibtex_show={true},
 }
-
-@article{jiang2024mantis,
+@article{Jiang2024MANTISIM,
   title={MANTIS: Interleaved Multi-Image Instruction Tuning},
-  author={Dongfu Jiang and Xuan He and Huaye Zeng and Cong Wei and Max Ku and Qian Liu and Wenhu Chen},
+  author={Dongfu Jiang and Xuan He and Huaye Zeng and Cong Wei and Max W.F. Ku and Qian Liu and Wenhu Chen},
   abstract={Large multimodal models (LMMs) have shown great results in single-image vision language tasks. However, their abilities to solve multi-image visual language tasks is yet to be improved. The existing LMMs like OpenFlamingo, Emu2, Idefics gain their multi-image ability through pre-training on hundreds of millions of noisy interleaved image-text data from the web, which is neither efficient nor effective. In this paper, we aim to build strong multi-image LMMs via instruction tuning with academic-level resources. Therefore, we meticulously construct Mantis-Instruct containing 721K multi-image instruction data to train a family of models Mantis. The instruction tuning empowers Mantis with different multi-image skills like co-reference, comparison, reasoning, and temporal understanding. We evaluate Mantis on five multi-image benchmarks and seven single-image benchmarks. Mantis-SigLIP can achieve SoTA results on all the multi-image benchmarks and beat the strongest multi-image baseline, Idefics2-8B by an average of 11 absolute points. Notably, Idefics2-8B was pre-trained on 140M interleaved multi-image data, which is 200x larger than Mantis-Instruct. We observe that Mantis performs equivalently well on the held-in and held-out benchmarks, which shows its generalization ability. Notably, we found that Mantis can even match the performance of GPT-4V on multi-image benchmarks. We further evaluate Mantis on single-image benchmarks and demonstrate that Mantis also maintains a strong single-image performance on par with CogVLM and Emu2. Our results show that multi-image abilities are not necessarily gained through massive pre-training, instead, it can be gained by the low-cost instruction tuning. Our work provides new perspectives on how to improve LMMs' multi-image abilities.},
-  journal={ArXiv},
-  booltitle={ArXiv},
+  journal={Transactions on Machine Learning Research},
   year={2024},
   eprint={2405.01483},
   archivePrefix={arXiv},
   primaryClass={cs.CV},
   url={https://arxiv.org/abs/2405.01483},
   website = "https://tiger-ai-lab.github.io/Mantis/",
   github = "TIGER-AI-Lab/Mantis",
-  abbr={arXiv},
+  abbr={TMLR 2024},
   preview={mantis_preview.png},
   arxiv={2405.01483},
   twitter = "https://twitter.com/DongfuJiang/status/1786552974598078677",
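For reference, this is how the MANTIS entry reads after the hunk is applied, reconstructed directly from the context and + lines above. The abstract is elided to {...} here for readability, the closing brace falls outside the hunk and is assumed, and the non-standard fields (abbr, website, github, preview, arxiv, twitter) appear to be al-folio/Jekyll theme extensions rather than standard BibTeX:

@article{Jiang2024MANTISIM,
  title={MANTIS: Interleaved Multi-Image Instruction Tuning},
  author={Dongfu Jiang and Xuan He and Huaye Zeng and Cong Wei and Max W.F. Ku and Qian Liu and Wenhu Chen},
  abstract={...},
  journal={Transactions on Machine Learning Research},
  year={2024},
  eprint={2405.01483},
  archivePrefix={arXiv},
  primaryClass={cs.CV},
  url={https://arxiv.org/abs/2405.01483},
  website = "https://tiger-ai-lab.github.io/Mantis/",
  github = "TIGER-AI-Lab/Mantis",
  abbr={TMLR 2024},
  preview={mantis_preview.png},
  arxiv={2405.01483},
  twitter = "https://twitter.com/DongfuJiang/status/1786552974598078677",
}

The substantive change is the venue update from a preliminary arXiv record (journal={ArXiv} plus the misspelled booltitle field) to the TMLR publication, with the matching abbr badge, a new citation key, and the corrected author name "Max W.F. Ku".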
