% Conference-paper records bb231400 through bb231499.
% Each record carries a BIBSOURCE field holding the visionbib.com page anchor it was taken from.
% BOOKTITLE values (CVPR24, EarthVision24, FGVC24, PBDL24, Prompting24, WhatNext24, PVUW24)
% are bare macro names, not literal strings; their definitions are not part of this
% section and must be available to BibTeX before these records are read.
% PAGES ranges use the double-hyphen form so styles typeset them as number ranges.

@inproceedings{bb231400,
  AUTHOR    = "Zhang, Y.C. and Qian, S.J. and Peng, B. and Liu, S. and Jia, J.Y.",
  TITLE     = "Prompt Highlighter: Interactive Control for Multi-Modal LLMs",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13215--13224",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226380"
}

@inproceedings{bb231401,
  AUTHOR    = "Kaul, P. and Li, Z.Z. and Yang, H. and Dukler, Y. and Swaminathan, A. and Taylor, C.J. and Soatto, S.",
  TITLE     = "THRONE: An Object-Based Hallucination Benchmark for the Free-Form Generations of Large Vision-Language Models",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "27218--27228",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226381"
}

@inproceedings{bb231402,
  AUTHOR    = "Wang, D.K. and Xuan, S.Y. and Zhang, S.L.",
  TITLE     = "LocLLM: Exploiting Generalizable Human Keypoint Localization via Large Language Model",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "614--623",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226382"
}

@inproceedings{bb231403,
  AUTHOR    = "Liu, H.C. and Zhan, X.H. and Huang, S.L. and Mu, T.J. and Shan, Y.",
  TITLE     = "Programmable Motion Generation for Open-Set Motion Control Tasks",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "1399--1408",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226383"
}

@inproceedings{bb231404,
  AUTHOR    = "Zhu, L. and Chen, T.R. and Ji, D. and Ye, J.P. and Liu, J.",
  TITLE     = "LLaFS: When Large Language Models Meet Few-Shot Segmentation",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "3065--3075",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226384"
}

@inproceedings{bb231405,
  AUTHOR    = "Xia, Z.F. and Han, D.C. and Han, Y.Z. and Pan, X. and Song, S. and Huang, G.",
  TITLE     = "GSVA: Generalized Segmentation via Multimodal Large Language Models",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "3858--3869",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226385"
}

@inproceedings{bb231406,
  AUTHOR    = "Zhao, L. and Yang, Y. and Zhang, K. and Shao, W.Q. and Zhang, Y.X. and Qiao, Y. and Luo, P. and Ji, R.R.",
  TITLE     = "DiffAgent: Fast and Accurate Text-to-Image API Selection with Large Language Model",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "6390--6399",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226386"
}

@inproceedings{bb231407,
  AUTHOR    = "Yao, J. and Liu, Y.J. and Dong, Z. and Guo, M.F. and Hu, H. and Keutzer, K. and Du, L. and Zhou, D. and Zhang, S.H.",
  TITLE     = "PromptCoT: Align Prompt Distribution via Adapted Chain-of-Thought",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "7027--7037",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226387"
}

@inproceedings{bb231408,
  AUTHOR    = "Cai, Z.P. and Mueller, M. and Birkl, R. and Wofk, D. and Tseng, S.Y. and Cheng, J. and Stan, G.B.M. and Lai, V. and Paulitsch, M.",
  TITLE     = "L-MAGIC: Language Model Assisted Generation of Images with Coherence",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "7049--7058",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226388"
}

@inproceedings{bb231409,
  AUTHOR    = "Li, Y. and Liu, X. and Kag, A. and Hu, J. and Idelbayev, Y. and Sagar, D. and Wang, Y.Z. and Tulyakov, S. and Ren, J.",
  TITLE     = "TextCraftor: Your Text Encoder can be Image Quality Controller",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "7985--7995",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226389"
}

@inproceedings{bb231410,
  AUTHOR    = "Argaw, D.M. and Yoon, S.H. and Heilbron, F.C. and Deilamsalehy, H. and Bui, T. and Wang, Z.W. and Dernoncourt, F. and Chung, J.S.",
  TITLE     = "Scaling Up Video Summarization Pretraining with Large Language Models",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "8332--8341",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226390"
}

@inproceedings{bb231411,
  AUTHOR    = "Tong, S. and Liu, Z. and Zhai, Y.X. and Ma, Y. and LeCun, Y. and Xie, S.",
  TITLE     = "Eyes Wide Shut? Exploring the Visual Shortcomings of Multimodal LLMs",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "9568--9578",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226391"
}

@inproceedings{bb231412,
  AUTHOR    = "Lai, X. and Tian, Z. and Chen, Y. and Li, Y.W. and Yuan, Y.H. and Liu, S. and Jia, J.Y.",
  TITLE     = "LISA: Reasoning Segmentation via Large Language Model",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "9579--9589",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226392"
}

@inproceedings{bb231413,
  AUTHOR    = "Shang, C. and Zhou, S. and Zhang, H. and Ni, X.Z. and Yang, Y. and Wang, Y.",
  TITLE     = "Incremental Residual Concept Bottleneck Models",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "11030--11040",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226393"
}

@inproceedings{bb231414,
  AUTHOR    = "Xie, Y.T. and Chen, Q. and Wang, S. and To, M.S. and Lee, I. and Khoo, E.W. and Hendy, K. and Koh, D. and Xia, Y. and Wu, Q.",
  TITLE     = "PairAug: What Can Augmented Image-Text Pairs Do for Radiology?",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "11652--11661",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226394"
}

@inproceedings{bb231415,
  AUTHOR    = "Dong, Z.K. and Liu, X. and Chen, B. and Polak, P. and Zhang, P.",
  TITLE     = "MuseChat: A Conversational Music Recommendation System for Videos",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "12775--12785",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226395"
}

@inproceedings{bb231416,
  AUTHOR    = "Li, F. and Jiang, Q. and Zhang, H. and Ren, T. and Liu, S.L. and Zou, X. and Xu, H.Z. and Li, H.Y. and Yang, J.W. and Li, C.Y. and Zhang, L. and Gao, J.F.",
  TITLE     = "Visual in-Context Prompting",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "12861--12871",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226396"
}

@inproceedings{bb231417,
  AUTHOR    = "Sachdeva, R. and Zisserman, A.",
  TITLE     = "The Manga Whisperer: Automatically Generating Transcriptions for Comics",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "12967--12976",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226397"
}

@inproceedings{bb231418,
  AUTHOR    = "Xu, J.R. and Zhou, X.Y. and Yan, S. and Gu, X. and Arnab, A. and Sun, C. and Wang, X.L. and Schmid, C.",
  TITLE     = "Pixel Aligned Language Models",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13030--13039",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226398"
}

@inproceedings{bb231419,
  AUTHOR    = "Ye, Q.H. and Xu, H.Y. and Ye, J. and Yan, M. and Hu, A. and Liu, H. and Qian, Q. and Zhang, J. and Huang, F.",
  TITLE     = "mPLUG-Owl2: Revolutionizing Multi-modal Large Language Model with Modality Collaboration",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13040--13051",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226399"
}

@inproceedings{bb231420,
  AUTHOR    = "Qi, P. and Yan, Z. and Hsu, W. and Lee, M.L.",
  TITLE     = "Sniffer: Multimodal Large Language Model for Explainable Out-of-Context Misinformation Detection",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13052--13062",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226400"
}

@inproceedings{bb231421,
  AUTHOR    = "Wu, P.H. and Xie, S.",
  TITLE     = "V*: Guided Visual Search as a Core Mechanism in Multimodal LLMs",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13084--13094",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226401"
}

@inproceedings{bb231422,
  AUTHOR    = "He, R. and Cascante Bonilla, P. and Yang, Z.Y. and Berg, A.C. and Ordonez, V.",
  TITLE     = "Improved Visual Grounding through Self-Consistent Explanations",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13095--13105",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226402"
}

@inproceedings{bb231423,
  AUTHOR    = "Zhong, S.S. and Huang, Z.Z. and Gao, S. and Wen, W. and Lin, L. and Zitnik, M. and Zhou, P.",
  TITLE     = "Let's Think Outside the Box: Exploring Leap-of-Thought in Large Language Models with Creative Humor Generation",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13246--13257",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226403"
}

@inproceedings{bb231424,
  AUTHOR    = "Gao, Z. and Du, Y.T. and Zhang, X.T. and Ma, X.J. and Han, W.J. and Zhu, S.C. and Li, Q.",
  TITLE     = "CLOVA: A Closed-LOop Visual Assistant with Tool Usage and Update",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13258--13268",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226404"
}

@inproceedings{bb231425,
  AUTHOR    = "Feng, C. and Hsu, J. and Liu, W.Y. and Wu, J.J.",
  TITLE     = "Naturally Supervised 3D Visual Grounding with Language-Regularized Concept Learners",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13269--13278",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226405"
}

@inproceedings{bb231426,
  AUTHOR    = "Li, B. and Ge, Y.Y. and Ge, Y.X. and Wang, G.Z. and Wang, R. and Zhang, R.M. and Shan, Y.",
  TITLE     = "SEED-Bench: Benchmarking Multimodal Large Language Models",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13299--13308",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226406"
}

@inproceedings{bb231427,
  AUTHOR    = "Buettner, K. and Malakouti, S. and Li, X.L. and Kovashka, A.",
  TITLE     = "Incorporating Geo-Diverse Knowledge into Prompting for Increased Geographical Robustness in Object Recognition",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13515--13524",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226407"
}

@inproceedings{bb231428,
  AUTHOR    = "Liu, R. and Li, C. and Ge, Y.X. and Li, T.H. and Shan, Y. and Li, G.",
  TITLE     = "BT-Adapter: Video Conversation is Feasible Without Video Instruction Tuning",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13658--13667",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226408"
}

@inproceedings{bb231429,
  AUTHOR    = "Ding, X.P. and Han, J.H. and Xu, H. and Liang, X.D. and Zhang, W. and Li, X.M.",
  TITLE     = "Holistic Autonomous Driving Understanding by Bird's-Eye-View Injected Multi-Modal Large Models",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13668--13677",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226409"
}

@inproceedings{bb231430,
  AUTHOR    = "Li, J.X. and Vo, D.M. and Sugimoto, A. and Nakayama, H.",
  TITLE     = "EVCap: Retrieval-Augmented Image Captioning with External Visual-Name Memory for Open-World Comprehension",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13733--13742",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226410"
}

@inproceedings{bb231431,
  AUTHOR    = "Song, L. and Chen, Y. and Yang, S. and Ding, X.H. and Ge, Y.X. and Chen, Y.C. and Shan, Y.",
  TITLE     = "Low-Rank Approximation for Sparse Attention in Multi-Modal LLMs",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13763--13773",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226411"
}

@inproceedings{bb231432,
  AUTHOR    = "Guo, Q. and de Mello, S. and Yin, H.X. and Byeon, W. and Cheung, K.C. and Yu, Y.Z. and Luo, P. and Liu, S.",
  TITLE     = "RegionGPT: Towards Region Understanding Vision Language Model",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13796--13806",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226412"
}

@inproceedings{bb231433,
  AUTHOR    = "Yu, T.Y. and Yao, Y. and Zhang, H.Y. and He, T. and Han, Y.F. and Cui, G. and Hu, J.Y. and Liu, Z.Y. and Zheng, H.T. and Sun, M.",
  TITLE     = "RLHF-V: Towards Trustworthy MLLMs via Behavior Alignment from Fine-Grained Correctional Human Feedback",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13807--13816",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226413"
}

@inproceedings{bb231434,
  AUTHOR    = "Xuan, S.Y. and Guo, Q. and Yang, M. and Zhang, S.L.",
  TITLE     = "Pink: Unveiling the Power of Referential Comprehension for Multi-modal LLMs",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13838--13848",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226414"
}

@inproceedings{bb231435,
  AUTHOR    = "He, J.W. and Wang, Y.F. and Wang, L.J. and Lu, H.C. and He, J.Y. and Lan, J.P. and Luo, B. and Xie, X.",
  TITLE     = "Multi-Modal Instruction Tuned LLMs with Fine-Grained Visual Perception",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13980--13990",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226415"
}

@inproceedings{bb231436,
  AUTHOR    = "Yu, Q. and Sun, Q. and Zhang, X.S. and Cui, Y.F. and Zhang, F. and Cao, Y. and Wang, X.L. and Liu, J.J.",
  TITLE     = "CapsFusion: Rethinking Image-Text Data at Scale",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "14022--14032",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226416"
}

@inproceedings{bb231437,
  AUTHOR    = "Yao, J.W. and Qian, Q. and Hu, J.",
  TITLE     = "Multi-Modal Proxy Learning Towards Personalized Visual Multiple Clustering",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "14066--14075",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226417"
}

@inproceedings{bb231438,
  AUTHOR    = "Zou, B. and Yang, C. and Qiao, Y. and Quan, C.B. and Zhao, Y.J.",
  TITLE     = "LLaMA-Excitor: General Instruction Tuning via Indirect Feature Interaction",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "14089--14099",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226418"
}

@inproceedings{bb231439,
  AUTHOR    = "Huang, B. and Wang, X. and Chen, H. and Song, Z. and Zhu, W.W.",
  TITLE     = "VTimeLLM: Empower LLM to Grasp Video Moments",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "14271--14280",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226419"
}

@inproceedings{bb231440,
  AUTHOR    = "Hong, W. and Wang, W.H. and Lv, Q.S. and Xu, J.Z. and Yu, W. and Ji, J.H. and Wang, Y. and Wang, Z. and Dong, Y.X. and Ding, M. and Tang, J.",
  TITLE     = "CogAgent: A Visual Language Model for GUI Agents",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "14281--14290",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226420"
}

@inproceedings{bb231441,
  AUTHOR    = "Mitra, C. and Huang, B. and Darrell, T.J. and Herzig, R.",
  TITLE     = "Compositional Chain-of-Thought Prompting for Large Multimodal Models",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "14420--14431",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226421"
}

@inproceedings{bb231442,
  AUTHOR    = "Li, B. and Wang, Y. and Mao, J. and Ivanovic, B. and Veer, S. and Leung, K. and Pavone, M.",
  TITLE     = "Driving Everywhere with Large Language Model Policy Adaptation",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "14948--14957",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226422"
}

@inproceedings{bb231443,
  AUTHOR    = "Wei, Y.X. and Wang, Z. and Lu, Y.F. and Xu, C.X. and Liu, C.X. and Zhao, H. and Chen, S. and Wang, Y.F.",
  TITLE     = "Editable Scene Simulation for Autonomous Driving via Collaborative LLM-Agents",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "15077--15087",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226423"
}

@inproceedings{bb231444,
  AUTHOR    = "Shao, H. and Hu, Y.X. and Wang, L. and Song, G.L. and Waslander, S.L. and Liu, Y. and Li, H.S.",
  TITLE     = "LMDrive: Closed-Loop End-to-End Driving with Large Language Models",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "15120--15130",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226424"
}

@inproceedings{bb231445,
  AUTHOR    = "Ma, Y.S. and Cui, C. and Cao, X. and Ye, W.Q. and Liu, P.R. and Lu, J. and Abdelraouf, A. and Gupta, R. and Han, K.T. and Bera, A. and Rehg, J.M. and Wang, Z.",
  TITLE     = "LaMPilot: An Open Benchmark Dataset for Autonomous Driving with Language Model Programs",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "15141--15151",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226425"
}

@inproceedings{bb231446,
  AUTHOR    = "Zhang, J.W. and Xu, C. and Li, B.",
  TITLE     = "ChatScene: Knowledge-Enabled Safety-Critical Scenario Generation for Autonomous Vehicles",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "15459--15469",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226426"
}

@inproceedings{bb231447,
  AUTHOR    = "Liu, C. and Yin, K. and Cao, H.Y. and Jiang, X.H. and Li, X. and Liu, Y. and Jiang, D.Q. and Sun, X. and Xu, L.",
  TITLE     = "HRVDA: High-Resolution Visual Document Assistant",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "15534--15545",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226427"
}

@inproceedings{bb231448,
  AUTHOR    = "Luo, C. and Shen, Y.F. and Zhu, Z.Q. and Zheng, Q. and Yu, Z. and Yao, C.",
  TITLE     = "LayoutLLM: Layout Instruction Tuning with Large Language Models for Document Understanding",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "15630--15640",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226428"
}

@inproceedings{bb231449,
  AUTHOR    = "Yang, Y. and Sun, F.Y. and Weihs, L. and Vanderbilt, E. and Herrasti, A. and Han, W. and Wu, J.J. and Haber, N. and Krishna, R. and Liu, L.J. and Callison Burch, C. and Yatskar, M. and Kembhavi, A. and Clark, C.",
  TITLE     = "Holodeck: Language Guided Generation of 3D Embodied AI Environments",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "16277--16287",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226429"
}

@inproceedings{bb231450,
  AUTHOR    = "Qin, Y. and Zhou, E. and Liu, Q. and Yin, Z.F. and Sheng, L. and Zhang, R.M. and Qiao, Y. and Shao, J.",
  TITLE     = "MP5: A Multi-modal Open-ended Embodied System in Minecraft via Active Perception",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "16307--16316",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226430"
}

@inproceedings{bb231451,
  AUTHOR    = "Zhang, S. and Yu, X.Y. and Song, X.H. and Wang, X.H. and Jiang, S.Q.",
  TITLE     = "Imagine Before Go: Self-Supervised Generative Map for Object Goal Navigation",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "16414--16425",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226431"
}

@inproceedings{bb231452,
  AUTHOR    = "Li, H. and Yang, X. and Wang, Z.K. and Zhu, X.Z. and Zhou, J. and Qiao, Y. and Wang, X.G. and Li, H.S. and Lu, L.W. and Dai, J.F.",
  TITLE     = "Auto MC-Reward: Automated Dense Reward Design with Large Language Models for Minecraft",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "16426--16435",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226432"
}

@inproceedings{bb231453,
  AUTHOR    = "Liu, M.X. and Hayes, T.L. and Ricci, E. and Csurka, G. and Volpi, R.",
  TITLE     = "SHiNe: Semantic Hierarchy Nexus for Open-Vocabulary Object Detection",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "16634--16644",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226433"
}

@inproceedings{bb231454,
  AUTHOR    = "Kim, J. and Cho, E. and Kim, S. and Kim, H.W.J.",
  TITLE     = "Retrieval-Augmented Open-Vocabulary Object Detection",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "17427--17436",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226434"
}

@inproceedings{bb231455,
  AUTHOR    = "Saha, O. and van Horn, G. and Maji, S.",
  TITLE     = "Improved Zero-Shot Classification by Adapting VLMs with Text Descriptions",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "17542--17552",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226435"
}

@inproceedings{bb231456,
  AUTHOR    = "Toubal, I.E. and Avinash, A. and Alldrin, N.G. and Dlabal, J. and Zhou, W. and Luo, E. and Stretcu, O. and Xiong, H. and Lu, C.T. and Zhou, H. and Krishna, R. and Fuxman, A. and Duerig, T.",
  TITLE     = "Modeling Collaborator: Enabling Subjective Vision Classification with Minimal Human Effort via LLM Tool-Use",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "17553--17563",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226436"
}

@inproceedings{bb231457,
  AUTHOR    = "Li, X.Q. and Zhang, M.X. and Geng, Y. and Geng, H.R. and Long, Y.X. and Shen, Y. and Zhang, R.R. and Liu, J. and Dong, H.",
  TITLE     = "ManipLLM: Embodied Multimodal Large Language Model for Object-Centric Robotic Manipulation",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "18061--18070",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226437"
}

@inproceedings{bb231458,
  AUTHOR    = "Han, T. and Bain, M. and Nagrani, A. and Varol, G. and Xie, W. and Zisserman, A.",
  TITLE     = "AutoAD III: The Prequel: Back to the Pixels",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "18164--18174",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226438"
}

@inproceedings{bb231459,
  AUTHOR    = "Song, E. and Chai, W.H. and Wang, G. and Zhang, Y.C. and Zhou, H.Y. and Wu, F. and Chi, H.Z. and Guo, X. and Ye, T. and Zhang, Y.T. and Lu, Y. and Hwang, J.N. and Wang, G.",
  TITLE     = "MovieChat: From Dense Token to Sparse Memory for Long Video Understanding",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "18221--18232",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226439"
}

@inproceedings{bb231460,
  AUTHOR    = "Qu, H.X. and Cai, Y.J. and Liu, J.",
  TITLE     = "LLMs are Good Action Recognizers",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "18395--18406",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226440"
}

@inproceedings{bb231461,
  AUTHOR    = "Chen, J. and Lv, Z.Y. and Wu, S.W. and Lin, K.Q. and Song, C. and Gao, D.F. and Liu, J.W. and Gao, Z.T. and Mao, D.X. and Shou, M.Z.",
  TITLE     = "VideoLLM-online: Online Video Large Language Model for Streaming Video",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "18407--18418",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226441"
}

@inproceedings{bb231462,
  AUTHOR    = "Zhu, A. and Ke, Q.H. and Gong, M.M. and Bailey, J.",
  TITLE     = "Part-Aware Unified Representation of Language and Skeleton for Zero-Shot Action Recognition",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "18761--18770",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226442"
}

@inproceedings{bb231463,
  AUTHOR    = "Chen, T.J. and Yu, H.S. and Yang, Z.G. and Li, Z.C. and Sun, W. and Chen, C.",
  TITLE     = "OST: Refining Text Knowledge with Optimal Spatio-Temporal Descriptor for General Video Recognition",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "18888--18898",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226443"
}

@inproceedings{bb231464,
  AUTHOR    = "Zhao, Q.H. and Dai, Y. and Li, H. and Hu, W. and Zhang, F. and Liu, J.",
  TITLE     = "LTGC: Long-Tail Recognition via Leveraging LLMs-Driven Generated Content",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "19510--19520",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226444"
}

@inproceedings{bb231465,
  AUTHOR    = "Siddiqui, Y. and Alliegro, A. and Artemov, A. and Tommasi, T. and Sirigatti, D. and Rosov, V. and Dai, A. and Nie{\ss}ner, M.",
  TITLE     = "MeshGPT: Generating Triangle Meshes with Decoder-Only Transformers",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "19615--19625",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226445"
}

@inproceedings{bb231466,
  AUTHOR    = "Yuan, Z.H. and Ren, J. and Feng, C.M. and Zhao, H.S. and Cui, S.G. and Li, Z.",
  TITLE     = "Visual Programming for Zero-Shot Open-Vocabulary 3D Visual Grounding",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "20623--20633",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226446"
}

@inproceedings{bb231467,
  AUTHOR    = "Li, Z. and Gao, Z.Y. and Tan, C. and Ren, B. and Yang, L.T. and Li, S.Z.",
  TITLE     = "General Point Model Pretraining with Autoencoding and Autoregressive",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "20954--20964",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226447"
}

@inproceedings{bb231468,
  AUTHOR    = "Li, K.C. and Wang, Y. and He, Y. and Li, Y.Z. and Wang, Y. and Liu, Y. and Wang, Z. and Xu, J. and Chen, G. and Lou, P. and Wang, L.M. and Qiao, Y.",
  TITLE     = "MVBench: A Comprehensive Multi-modal Video Understanding Benchmark",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "22195--22206",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226448"
}

@inproceedings{bb231469,
  AUTHOR    = "Taesiri, M.R. and Feng, T.J. and Bezemer, C.P. and Nguyen, A.",
  TITLE     = "GlitchBench: Can Large Multimodal Models Detect Video Game Glitches?",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "22444--22455",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226449"
}

@inproceedings{bb231470,
  AUTHOR    = "Zhang, R. and Zhang, Y.Z. and Chen, J. and Zhou, Y.F. and Gu, J.X. and Chen, C. and Sun, T.",
  TITLE     = "TRINS: Towards Multimodal Language Models that Can Read",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "22584--22594",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226450"
}

@inproceedings{bb231471,
  AUTHOR    = "Dunlap, L. and Zhang, Y.H. and Wang, X.H. and Zhong, R.Q. and Darrell, T.J. and Steinhardt, J. and Gonzalez, J.E. and Yeung Levy, S.",
  TITLE     = "Describing Differences in Image Sets with Natural Language",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "24199--24208",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226451"
}

@inproceedings{bb231472,
  AUTHOR    = "Ishmam, A.M. and Thomas, C.",
  TITLE     = "Semantic Shield: Defending Vision-Language Models Against Backdooring and Poisoning via Fine-Grained Knowledge Alignment",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "24820--24830",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226452"
}

@inproceedings{bb231473,
  AUTHOR    = "Yang, Y.J. and Zhou, T.Y. and Li, K. and Tao, D.P. and Li, L. and Shen, L. and He, X.D. and Jiang, J. and Shi, Y.H.",
  TITLE     = "Embodied Multi-Modal Agent trained by an LLM from a Parallel TextWorld",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "26265--26275",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226453"
}

@inproceedings{bb231474,
  AUTHOR    = "Hong, Y. and Zheng, Z. and Chen, P.H. and Wang, Y.F. and Li, J. and Gan, C.",
  TITLE     = "MultiPLY: A Multisensory Object-Centric Embodied Large Language Model in 3D World",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "26396--26406",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226454"
}

@inproceedings{bb231475,
  AUTHOR    = "Chen, G. and Shen, L. and Shao, R. and Deng, X. and Nie, L.Q.",
  TITLE     = "LION: Empowering Multimodal Large Language Model with Dual-Level Visual Knowledge",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "26530--26540",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226455"
}

@inproceedings{bb231476,
  AUTHOR    = "Zhang, Y. and Dong, Y.P. and Zhang, S.Y. and Min, T.Z. and Su, H. and Zhu, J.",
  TITLE     = "Exploring the Transferability of Visual Prompting for Multimodal Large Language Models",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "26552--26562",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226456"
}

@inproceedings{bb231477,
  AUTHOR    = "Han, J. and Gong, K.X. and Zhang, Y.Y. and Wang, J.Q. and Zhang, K. and Lin, D. and Qiao, Y. and Gao, P. and Yue, X.Y.",
  TITLE     = "OneLLM: One Framework to Align All Modalities with Language",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "26574--26585",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226457"
}

@inproceedings{bb231478,
  AUTHOR    = "Xie, H.X. and Peng, C.J. and Tseng, Y.W. and Chen, H.J. and Hsu, C.F. and Shuai, H.H. and Cheng, W.H.",
  TITLE     = "EmoVIT: Revolutionizing Emotion Insights with Visual Instruction Tuning",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "26586--26595",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226458"
}

@inproceedings{bb231479,
  AUTHOR    = "Wang, X.Y. and Zhuang, B. and Wu, Q.",
  TITLE     = "ModaVerse: Efficiently Transforming Modalities with LLMs",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "26596--26606",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226459"
}

@inproceedings{bb231480,
  AUTHOR    = "Lin, J. and Yin, H.X. and Ping, W. and Molchanov, P. and Shoeybi, M. and Han, S.",
  TITLE     = "VILA: On Pre-training for Visual Language Models",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "26679--26689",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226460"
}

@inproceedings{bb231481,
  AUTHOR    = "Lyu, Y.H. and Zheng, X. and Zhou, J.Z. and Wang, L.",
  TITLE     = "UniBind: LLM-Augmented Unified and Balanced Representation Space to Bind Them All",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "26742--26752",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226461"
}

@inproceedings{bb231482,
  AUTHOR    = "Liang, T. and Huang, J. and Kong, M. and Chen, L. and Zhu, Q.",
  TITLE     = "Querying as Prompt: Parameter-Efficient Learning for Multimodal Language Model",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "26845--26855",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226462"
}

@inproceedings{bb231483,
  AUTHOR    = "Jiang, C.Y. and Xu, H.Y. and Dong, M.F. and Chen, J.X. and Ye, W. and Yan, M. and Ye, Q.H. and Zhang, J. and Huang, F. and Zhang, S.K.",
  TITLE     = "Hallucination Augmented Contrastive Learning for Multimodal Large Language Model",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "27026--27036",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226463"
}

@inproceedings{bb231484,
  AUTHOR    = "Zhu, L. and Wei, F. and Lu, Y.",
  TITLE     = "Beyond Text: Frozen Large Language Models in Visual Signal Comprehension",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "27037--27047",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226464"
}

@inproceedings{bb231485,
  AUTHOR    = "Pi, R.J. and Yao, L.W. and Gao, J. and Zhang, J.P. and Zhang, T.",
  TITLE     = "PerceptionGPT: Effectively Fusing Visual Perception Into LLM",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "27114--27123",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226465"
}

@inproceedings{bb231486,
  AUTHOR    = "Tai, Y. and Fan, W.C. and Zhang, Z. and Liu, Z.W.",
  TITLE     = "Link-Context Learning for Multimodal LLMs",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "27166--27175",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226466"
}

@inproceedings{bb231487,
  AUTHOR    = "Tang, Z. and Yang, Z. and Khademi, M. and Liu, Y. and Zhu, C.G. and Bansal, M.",
  TITLE     = "CoDi-2: In-Context, Interleaved, and Interactive Any-to-Any Generation",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "27415--27424",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226467"
}

@inproceedings{bb231488,
  AUTHOR    = "Jain, J. and Yang, J.W. and Shi, H.",
  TITLE     = "VCoder: Versatile Vision Encoders for Multimodal Large Language Models",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "27992--28002",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226468"
}

@inproceedings{bb231489,
  AUTHOR    = "Yuan, Y.Q. and Li, W. and Liu, J. and Tang, D.Q. and Luo, X.J. and Qin, C. and Zhang, L. and Zhu, J.",
  TITLE     = "Osprey: Pixel Understanding with Visual Instruction Tuning",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "28202--28211",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226469"
}

@inproceedings{bb231490,
  AUTHOR    = "Zhai, A.J. and Shen, Y. and Chen, E.Y. and Wang, G.X. and Wang, X.L. and Wang, S. and Guan, K.Y. and Wang, S.",
  TITLE     = "Physical Property Understanding from Language-Embedded Feature Fields",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "28296--28305",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226470"
}

@inproceedings{bb231491,
  AUTHOR    = "Zheng, Z.H. and Wei, J. and Hu, X.F. and Zhu, H.D. and Nevatia, R.",
  TITLE     = "Large Language Models are Good Prompt Learners for Low-Shot Image Classification",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "28453--28462",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226471"
}

@inproceedings{bb231492,
  AUTHOR    = "He, H.Y. and Pan, Z.Z. and Liu, J. and Cai, J.F. and Zhuang, B.",
  TITLE     = "Efficient Stitchable Task Adaptation",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "28555--28565",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226472"
}

@inproceedings{bb231493,
  AUTHOR    = "Tian, X.Y. and Zou, S. and Yang, Z.Y. and Zhang, J.",
  TITLE     = "ArGue: Attribute-Guided Prompt Tuning for Vision-Language Models",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "28578--28587",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226473"
}

@inproceedings{bb231494,
  AUTHOR    = "Roberts, J. and Luddecke, T. and Sheikh, R. and Han, K. and Albanie, S.",
  TITLE     = "Charting New Territories: Exploring the Geographic and Geospatial Capabilities of Multimodal LLMs",
  BOOKTITLE = EarthVision24,
  YEAR      = "2024",
  PAGES     = "554--563",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226474"
}

@inproceedings{bb231495,
  AUTHOR    = "Barbany, O. and Huang, M. and Zhu, X.L. and Dhua, A.",
  TITLE     = "Leveraging Large Language Models for Multimodal Search",
  BOOKTITLE = FGVC24,
  YEAR      = "2024",
  PAGES     = "1201--1210",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226475"
}

@inproceedings{bb231496,
  AUTHOR    = "Lv, J.X. and Huang, Y. and Yan, M. and Huang, J.C. and Liu, J.Z. and Liu, Y.F. and Wen, Y.F. and Chen, X.X. and Chen, S.F.",
  TITLE     = "GPT4Motion: Scripting Physical Motions in Text-to-Video Generation via Blender-Oriented GPT Planning",
  BOOKTITLE = PBDL24,
  YEAR      = "2024",
  PAGES     = "1430--1440",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226476"
}

@inproceedings{bb231497,
  AUTHOR    = "Baldassini, F.B. and Shukor, M. and Cord, M. and Soulier, L. and Piwowarski, B.",
  TITLE     = "What Makes Multimodal In-Context Learning Work?",
  BOOKTITLE = Prompting24,
  YEAR      = "2024",
  PAGES     = "1539--1550",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226477"
}

@inproceedings{bb231498,
  AUTHOR    = "Wang, J.C. and Ke, L.",
  TITLE     = "LLM-Seg: Bridging Image Segmentation and Large Language Model Reasoning",
  BOOKTITLE = WhatNext24,
  YEAR      = "2024",
  PAGES     = "1765--1774",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226478"
}

@inproceedings{bb231499,
  AUTHOR    = "Qu, M.X. and Chen, X.D. and Liu, W. and Li, A. and Zhao, Y.",
  TITLE     = "ChatVTG: Video Temporal Grounding via Chat with Video Dialogue Large Language Models",
  BOOKTITLE = PVUW24,
  YEAR      = "2024",
  PAGES     = "1847--1856",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT226479"
}