We present GROUNDHOG, a multimodal large language model developed by grounding large language models to holistic segmentation. GROUNDHOG is flexible and diagnosable, reduces object hallucination, and works in a plug-and-play fashion with any segmentation foundation model (e.g., SAM).
@inproceedings{zhang2024groundhog,
  author    = {Zhang, Yichi and Ma, Ziqiao and Gao, Xiaofeng and Shakiah, Suhaila and Gao, Qiaozi and Chai, Joyce},
  title     = {{GROUNDHOG}: Grounding Large Language Models to Holistic Segmentation},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition},
  year      = {2024},
}