% ---------------------------------------------------------------------------
% Entries bb233800 - bb233899, one field per line.
%
% Conventions used by every entry in this section:
%   * BOOKTITLE / JOURNAL hold bare macro names (ICCV23, CVPR25, CirSysVideo,
%     ...). Their string definitions are not part of this section; they are
%     presumably declared elsewhere in the database -- TODO confirm, since an
%     undefined macro expands to empty text.
%   * PAGES of the ECCV entries carry a roman-numeral prefix before the page
%     range (e.g. "XXXVI:1--21"); this looks like the proceedings part/volume
%     number -- verify before splitting it into a separate field.
%   * BIBSOURCE is a non-standard field recording the visionbib.com page and
%     anchor an entry was taken from; the entries are grouped by that page.
% ---------------------------------------------------------------------------

% ---- BIBSOURCE page: applicat803vlm3.html ---------------------------------

@inproceedings{bb233800,
  AUTHOR    = "Zhang, H. and Liu, D. and Lv, Z. and Su, B. and Tao, D.C.",
  TITLE     = "Exploring Temporal Concurrency for Video-Language Representation Learning",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "15522--15532",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228775"
}

@inproceedings{bb233801,
  AUTHOR    = "Shukor, M. and Dancette, C. and Cord, M.",
  TITLE     = "eP-ALM: Efficient Perceptual Augmentation of Language Models",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "21999--22012",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228776"
}

@inproceedings{bb233802,
  AUTHOR    = "Schulter, S. and Kumar, B.G.V. and Suh, Y.M. and Dafnis, K.M. and Zhang, Z.X. and Zhao, S.Y. and Metaxas, D.N.",
  TITLE     = "OmniLabel: A Challenging Benchmark for Language-Based Object Detection",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "11919--11928",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228777"
}

@inproceedings{bb233803,
  AUTHOR    = "Chen, Z.L. and Huang, X. and Guan, Q.L. and Lin, L. and Luo, W.Q.",
  TITLE     = "A Retrospect to Multi-prompt Learning across Vision and Language",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "22133--22144",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228778"
}

@inproceedings{bb233804,
  AUTHOR    = "Derakhshani, M.M. and Sanchez, E. and Bulat, A. and da Costa, V.G.T. and Snoek, C.G.M. and Tzimiropoulos, G. and Martinez, B.",
  TITLE     = "Bayesian Prompt Learning for Image-Language Model Generalization",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "15191--15200",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228779"
}

% The ampersand in the title is escaped: a bare one is a LaTeX alignment
% character and aborts typesetting of the bibliography.
@inproceedings{bb233805,
  AUTHOR    = "Cascante Bonilla, P. and Shehada, K. and Smith, J.S. and Doveh, S. and Kim, D.H. and Panda, R. and Varol, G. and Oliva, A. and Ordonez, V. and Feris, R.S. and Karlinsky, L.",
  TITLE     = "Going Beyond Nouns With Vision \& Language Models Using Synthetic Data",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "20098--20108",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228780"
}

% NOTE(review): "Vison" is kept as-is; it appears to be the spelling of the
% published title -- confirm against the proceedings before "correcting" it.
@inproceedings{bb233806,
  AUTHOR    = "Upadhyay, U. and Karthik, S. and Mancini, M. and Akata, Z.",
  TITLE     = "ProbVLM: Probabilistic Adapter for Frozen Vison-Language Models",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "1899--1910",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228781"
}

@inproceedings{bb233807,
  AUTHOR    = "Bitton Guetta, N. and Bitton, Y. and Hessel, J. and Schmidt, L. and Elovici, Y. and Stanovsky, G. and Schwartz, R.",
  TITLE     = "Breaking Common Sense: WHOOPS! A Vision-and-Language Benchmark of Synthetic and Compositional Images",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "2616--2627",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228782"
}

@inproceedings{bb233808,
  AUTHOR    = "Hu, Z.Y. and Li, Y.Y. and Lyu, M.R. and Wang, L.W.",
  TITLE     = "VL-PET: Vision-and-Language Parameter-Efficient Tuning via Granularity Control",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "2998--3008",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228783"
}

@inproceedings{bb233809,
  AUTHOR    = "Slyman, E. and Kahng, M. and Lee, S.",
  TITLE     = "VLSlice: Interactive Vision-and-Language Slice Discovery",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "15245--15255",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228784"
}

@inproceedings{bb233810,
  AUTHOR    = "Najibi, M. and Ji, J.W. and Zhou, Y. and Qi, C.R. and Yan, X.C. and Ettinger, S. and Anguelov, D.",
  TITLE     = "Unsupervised 3D Perception with 2D Vision-Language Distillation for Autonomous Driving",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "8568--8578",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228785"
}

@inproceedings{bb233811,
  AUTHOR    = "Xu, H. and Xie, S. and Huang, P.Y. and Yu, L.C. and Howes, R. and Ghosh, G. and Zettlemoyer, L. and Feichtenhofer, C.",
  TITLE     = "CiT: Curation in Training for Effective Vision-Language Data",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "15134--15143",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228786"
}

@inproceedings{bb233812,
  AUTHOR    = "Trager, M. and Perera, P. and Zancato, L. and Achille, A. and Bhatia, P. and Soatto, S.",
  TITLE     = "Linear Spaces of Meanings: Compositional Structures in Vision-Language Models",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "15349--15358",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228787"
}

@inproceedings{bb233813,
  AUTHOR    = "Chen, Y.S. and Song, Y.Z. and Yeo, C.Y. and Liu, B. and Fu, J.L. and Shuai, H.H.",
  TITLE     = "SINC: Self-Supervised In-Context Learning for Vision-Language Tasks",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "15384--15396",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228788"
}

@inproceedings{bb233814,
  AUTHOR    = "Wu, C.E. and Tian, Y. and Yu, H.C. and Wang, H. and Morgado, P. and Hu, Y.H. and Yang, L.J.",
  TITLE     = "Why Is Prompt Tuning for Vision-Language Models Robust to Noisy Labels?",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "15442--15451",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228789"
}

% Third author spelled "Martinez", matching the same author group in
% bb233804 and bb233882.
@inproceedings{bb233815,
  AUTHOR    = "Ouali, Y. and Bulat, A. and Martinez, B. and Tzimiropoulos, G.",
  TITLE     = "Black Box Few-Shot Adaptation for Vision-Language models",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "15488--15500",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228790"
}

@inproceedings{bb233816,
  AUTHOR    = "Kan, B. and Wang, T. and Lu, W.P. and Zhen, X.T. and Guan, W. and Zheng, F.",
  TITLE     = "Knowledge-Aware Prompt Tuning for Generalizable Vision-Language Models",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "15624--15634",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228791"
}

@inproceedings{bb233817,
  AUTHOR    = "Zhai, J.T. and Zhang, Q. and Wu, T. and Chen, X.Y. and Liu, J.J. and Cheng, M.M.",
  TITLE     = "SLAN: Self-Locator Aided Network for Vision-Language Understanding",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "21892--21901",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228792"
}

@inproceedings{bb233818,
  AUTHOR    = "Long, S. and Zhao, Z. and Yuan, J. and Tan, Z.C. and Liu, J.J. and Zhou, L.P. and Wang, S.S. and Wang, J.D.",
  TITLE     = "Task-Oriented Multi-Modal Mutual Learning for Vision-Language Models",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "21902--21912",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228793"
}

@inproceedings{bb233819,
  AUTHOR    = "Cho, E. and Kim, J. and Kim, H.W.J.",
  TITLE     = "Distribution-Aware Prompt Tuning for Vision-Language Models",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "21947--21956",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228794"
}

@inproceedings{bb233820,
  AUTHOR    = "Varma, M. and Delbrouck, J.B. and Hooper, S. and Chaudhari, A. and Langlotz, C.",
  TITLE     = "ViLLA: Fine-Grained Vision-Language Representation Learning from Real-World Data",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "22168--22178",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228795"
}

@inproceedings{bb233821,
  AUTHOR    = "Zhu, H.G. and Wei, Y.C. and Liang, X.D. and Zhang, C.J. and Zhao, Y.",
  TITLE     = "CTP: Towards Vision-Language Continual Pretraining via Compatible Momentum Contrast and Topology Preservation",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "22200--22210",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228796"
}

@inproceedings{bb233822,
  AUTHOR    = "Hu, Z.Z. and Zhu, X.L. and Tran, S. and Vidal, R. and Dhua, A.",
  TITLE     = "ProVLA: Compositional Image Search with Progressive Vision-Language Alignment and Multimodal Fusion",
  BOOKTITLE = CLVL23,
  YEAR      = "2023",
  PAGES     = "2764--2769",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228797"
}

@inproceedings{bb233823,
  AUTHOR    = "Hall, M. and Gustafson, L. and Adcock, A. and Misra, I. and Ross, C.",
  TITLE     = "Vision-Language Models Performing Zero-Shot Tasks Exhibit Disparities Between Gender Groups",
  BOOKTITLE = CLVL23,
  YEAR      = "2023",
  PAGES     = "2770--2777",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228798"
}

@inproceedings{bb233824,
  AUTHOR    = "Agnolucci, L. and Baldrati, A. and Todino, F. and Becattini, F. and Bertini, M. and del Bimbo, A.",
  TITLE     = "ECO: Ensembling Context Optimization for Vision-Language Models",
  BOOKTITLE = CLVL23,
  YEAR      = "2023",
  PAGES     = "2803--2807",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228799"
}

@inproceedings{bb233825,
  AUTHOR    = "Palit, V. and Pandey, R. and Arora, A. and Liang, P.P.",
  TITLE     = "Towards Vision-Language Mechanistic Interpretability: A Causal Tracing Tool for BLIP",
  BOOKTITLE = CLVL23,
  YEAR      = "2023",
  PAGES     = "2848--2853",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228800"
}

@inproceedings{bb233826,
  AUTHOR    = "Sammani, F. and Deligiannis, N.",
  TITLE     = "Uni-NLX: Unifying Textual Explanations for Vision and Vision-Language Tasks",
  BOOKTITLE = VLAR23,
  YEAR      = "2023",
  PAGES     = "4636--4641",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228801"
}

@inproceedings{bb233827,
  AUTHOR    = "Lee, D.J. and Song, S. and Suh, J. and Choi, J. and Lee, S. and Kim, H.W.J.",
  TITLE     = "Read-only Prompt Optimization for Vision-Language Few-shot Learning",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "1401--1411",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228802"
}

@inproceedings{bb233828,
  AUTHOR    = "Li, X. and Fang, Y.H. and Liu, M.H. and Ling, Z. and Tu, Z.W. and Su, H.",
  TITLE     = "Distilling Large Vision-Language Model with Out-of-Distribution Generalizability",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "2492--2503",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228803"
}

@inproceedings{bb233829,
  AUTHOR    = "Li, J.C. and Gao, M. and Wei, L.H. and Tang, S.L. and Zhang, W.Q. and Li, M.Z. and Ji, W. and Tian, Q. and Chua, T.S. and Zhuang, Y.T.",
  TITLE     = "Gradient-Regulated Meta-Prompt Learning for Generalizable Vision-Language Models",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "2551--2562",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228804"
}

@inproceedings{bb233830,
  AUTHOR    = "Bi, J.Y. and Cheng, D. and Yao, P. and Pang, B. and Zhan, Y.F. and Yang, C.G. and Wang, Y.J. and Sun, H. and Deng, W.W. and Zhang, Q.",
  TITLE     = "VL-Match: Enhancing Vision-Language Pretraining with Token-Level and Instance-Level Matching",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "2584--2593",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228805"
}

@inproceedings{bb233831,
  AUTHOR    = "Udandarao, V. and Gupta, A. and Albanie, S.",
  TITLE     = "SuS-X: Training-Free Name-Only Transfer of Vision-Language Models",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "2725--2736",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228806"
}

@inproceedings{bb233832,
  AUTHOR    = "Jiang, C.Y. and Xu, H.Y. and Ye, W. and Ye, Q.H. and Li, C.L. and Yan, M. and Bi, B. and Zhang, S.K. and Huang, F. and Huang, S.F.",
  TITLE     = "BUS: Efficient and Effective Vision-language Pre-training with Bottom-Up Patch Summarization",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "2888--2898",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228807"
}

@inproceedings{bb233833,
  AUTHOR    = "Shi, C. and Yang, S.",
  TITLE     = "LoGoPrompt: Synthetic Text Images Can Be Good Visual Prompts for Vision-Language Models",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "2920--2929",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228808"
}

@inproceedings{bb233834,
  AUTHOR    = "Wang, A.J.P. and Lin, K.Q. and Zhang, D.J.H. and Lei, S.W.X. and Shou, M.Z.",
  TITLE     = "Too Large; Data Reduction for Vision-Language Pre-Training",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "3124--3134",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228809"
}

@inproceedings{bb233835,
  AUTHOR    = "Wang, W.H. and Yang, Z. and Xu, B. and Li, J. and Sun, Y.K.",
  TITLE     = "ViLTA: Enhancing Vision-Language Pre-training through Textual Augmentation",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "3135--3146",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228810"
}

@inproceedings{bb233836,
  AUTHOR    = "Boecking, B. and Usuyama, N. and Bannur, S. and Castro, D.C. and Schwaighofer, A. and Hyland, S. and Wetscherek, M. and Naumann, T. and Nori, A. and Alvarez Valle, J. and Poon, H. and Oktay, O.",
  TITLE     = "Making the Most of Text Semantics to Improve Biomedical Vision-Language Processing",
  BOOKTITLE = ECCV22,
  YEAR      = "2022",
  PAGES     = "XXXVI:1--21",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228811"
}

@inproceedings{bb233837,
  AUTHOR    = "Cui, Q. and Zhou, B. and Guo, Y. and Yin, W.D. and Wu, H. and Yoshie, O. and Chen, Y.",
  TITLE     = "Contrastive Vision-Language Pre-training with Limited Resources",
  BOOKTITLE = ECCV22,
  YEAR      = "2022",
  PAGES     = "XXXVI:236--253",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228812"
}

@inproceedings{bb233838,
  AUTHOR    = "Walmer, M. and Sikka, K. and Sur, I. and Shrivastava, A. and Jha, S.",
  TITLE     = "Dual-Key Multimodal Backdoors for Visual Question Answering",
  BOOKTITLE = CVPR22,
  YEAR      = "2022",
  PAGES     = "15354--15364",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228813"
}

@inproceedings{bb233839,
  AUTHOR    = "Ding, Y. and Yu, J. and Liu, B. and Hu, Y. and Cui, M.X. and Wu, Q.",
  TITLE     = "MuKEA: Multimodal Knowledge Extraction and Accumulation for Knowledge-based Visual Question Answering",
  BOOKTITLE = CVPR22,
  YEAR      = "2022",
  PAGES     = "5079--5088",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228814"
}

@inproceedings{bb233840,
  AUTHOR    = "Gao, F. and Ping, Q. and Thattai, G. and Reganti, A. and Wu, Y.N. and Natarajan, P.",
  TITLE     = "Transform-Retrieve-Generate: Natural Language-Centric Outside-Knowledge Visual Question Answering",
  BOOKTITLE = CVPR22,
  YEAR      = "2022",
  PAGES     = "5057--5067",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228815"
}

@inproceedings{bb233841,
  AUTHOR    = "Aflalo, E. and Du, M. and Tseng, S.Y. and Liu, Y.F. and Wu, C. and Duan, N. and Lal, V.",
  TITLE     = "VL-InterpreT: An Interactive Visualization Tool for Interpreting Vision-Language Transformers",
  BOOKTITLE = CVPR22,
  YEAR      = "2022",
  PAGES     = "21374--21383",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228816"
}

@inproceedings{bb233842,
  AUTHOR    = "Hu, X.W. and Gan, Z. and Wang, J.F. and Yang, Z.Y. and Liu, Z.C. and Lu, Y. and Wang, L.J.",
  TITLE     = "Scaling Up Vision-Language Pretraining for Image Captioning",
  BOOKTITLE = CVPR22,
  YEAR      = "2022",
  PAGES     = "17959--17968",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228817"
}

@inproceedings{bb233843,
  AUTHOR    = "Zhang, P.C. and Li, X.J. and Hu, X.W. and Yang, J.W. and Zhang, L. and Wang, L.J. and Choi, Y.J. and Gao, J.F.",
  TITLE     = "VinVL: Revisiting Visual Representations in Vision-Language Models",
  BOOKTITLE = CVPR21,
  YEAR      = "2021",
  PAGES     = "5575--5584",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228818"
}

@inproceedings{bb233844,
  AUTHOR    = "Li, Z.W. and Stengel Eskin, E. and Zhang, Y.X. and Xie, C. and Tran, Q. and van Durme, B. and Yuille, A.L.",
  TITLE     = "Calibrating Concepts and Operations: Towards Symbolic Reasoning on Real Images",
  BOOKTITLE = ICCV21,
  YEAR      = "2021",
  PAGES     = "14890--14899",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228819"
}

@inproceedings{bb233845,
  AUTHOR    = "Yang, X. and Zhang, H.W. and Qi, G.J. and Cai, J.F.",
  TITLE     = "Causal Attention for Vision-Language Tasks",
  BOOKTITLE = CVPR21,
  YEAR      = "2021",
  PAGES     = "9842--9852",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228820"
}

@inproceedings{bb233846,
  AUTHOR    = "Stefanini, M. and Cornia, M. and Baraldi, L. and Cucchiara, R.",
  TITLE     = "A Novel Attention-based Aggregation Function to Combine Vision and Language",
  BOOKTITLE = ICPR21,
  YEAR      = "2021",
  PAGES     = "1212--1219",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228821"
}

@inproceedings{bb233847,
  AUTHOR    = "Jain, V. and Lodhavia, J.",
  TITLE     = "Automatic Question Tagging using k-Nearest Neighbors and Random Forest",
  BOOKTITLE = ISCV20,
  YEAR      = "2020",
  PAGES     = "1--4",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228822"
}

@inproceedings{bb233848,
  AUTHOR    = "Zheng, W.B. and Yan, L. and Gou, C. and Wang, F.Y.",
  TITLE     = "Webly Supervised Knowledge Embedding Model for Visual Reasoning",
  BOOKTITLE = CVPR20,
  YEAR      = "2020",
  PAGES     = "12442--12451",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228823"
}

@inproceedings{bb233849,
  AUTHOR    = "Nguyen, D.K. and Okatani, T.",
  TITLE     = "Multi-Task Learning of Hierarchical Vision-Language Representation",
  BOOKTITLE = CVPR19,
  YEAR      = "2019",
  PAGES     = "10484--10493",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228824"
}

@inproceedings{bb233850,
  AUTHOR    = "Gupta, T. and Shih, K.J. and Singh, S. and Hoiem, D.",
  TITLE     = "Aligned Image-Word Representations Improve Inductive Transfer Across Vision-Language Tasks",
  BOOKTITLE = ICCV17,
  YEAR      = "2017",
  PAGES     = "4223--4232",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228825"
}

% ---- BIBSOURCE page: applicat803hallum4.html ------------------------------

@inproceedings{bb233851,
  AUTHOR    = "Yu, T.Y. and Zhang, H. and Li, Q.M. and Xu, Q.X. and Yao, Y. and Chen, D. and Lu, X.M. and Cui, G. and Dang, Y.K. and He, T. and Feng, X.C. and Song, J. and Zheng, B. and Liu, Z.Y. and Chua, T.S. and Sun, M.S.",
  TITLE     = "RLAIF-V: Open-Source AI Feedback Leads to Super GPT-4V Trustworthiness",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "19985--19995",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228826"
}

@inproceedings{bb233852,
  AUTHOR    = "Liang, J. and Huang, W.K. and Wan, G.C. and Yang, Q. and Ye, M.",
  TITLE     = "LoRASculpt: Sculpting LoRA for Harmonizing General and Specialized Knowledge in Multimodal Large Language Models",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "26170--26180",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228827"
}

@inproceedings{bb233853,
  AUTHOR    = "Cao, Y. and Xing, Y. and Zhang, J. and Lin, D. and Zhang, T.W. and Tsang, I. and Liu, Y. and Guo, Q.",
  TITLE     = "SceneTAP: Scene-Coherent Typographic Adversarial Planner against Vision-Language Models in Real-World Environments",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "25050--25059",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228828"
}

@inproceedings{bb233854,
  AUTHOR    = "Wang, Y.B. and Guan, J. and Liang, J. and He, R.",
  TITLE     = "Do We Really Need Curated Malicious Data for Safety Alignment in Multi-modal Large Language Models?",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "19879--19889",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228829"
}

% NOTE(review): the leading word "Patch" was restored; the title previously
% began with a bare "Matters:". Confirm against the CVPR 2025 proceedings.
@inproceedings{bb233855,
  AUTHOR    = "Peng, R. and He, H.Y. and Wei, Y. and Wen, Y.D. and Hu, D.",
  TITLE     = "Patch Matters: Training-free Fine-grained Image Caption Enhancement via Local Perception",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "3963--3973",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228830"
}

@inproceedings{bb233856,
  AUTHOR    = "Yang, Z. and Luo, X. and Han, D.Q. and Xu, Y.J. and Li, D.S.",
  TITLE     = "Mitigating Hallucinations in Large Vision-Language Models via DPO: On-Policy Data Hold the Key",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "10610--10620",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228831"
}

@inproceedings{bb233857,
  AUTHOR    = "Bae, K. and Kim, J. and Lee, S. and Lee, S. and Lee, G. and Choi, J.",
  TITLE     = "MASH-VLM: Mitigating Action-Scene Hallucination in Video-LLMs through Disentangled Spatial-Temporal Representations",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "13744--13753",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228832"
}

@inproceedings{bb233858,
  AUTHOR    = "Yin, H. and Si, G.Z. and Wang, Z.",
  TITLE     = "ClearSight: Visual Signal Enhancement for Object Hallucination Mitigation in Multimodal Large Language Models",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "14625--14634",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228833"
}

@inproceedings{bb233859,
  AUTHOR    = "Yang, L. and Zheng, Z.W. and Chen, B. and Zhao, Z.Y. and Lin, C.H. and Shen, C.",
  TITLE     = "Nullu: Mitigating Object Hallucinations in Large Vision-Language Models via HalluSpace Projection",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "14635--14645",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228834"
}

@inproceedings{bb233860,
  AUTHOR    = "Wu, Y.C. and Zhang, L. and Yao, H. and Du, J.L. and Yan, K. and Ding, S.H. and Wu, Y.S. and Li, X.Q.",
  TITLE     = "Antidote: A Unified Framework for Mitigating LVLM Hallucinations in Counterfactual Presupposition and Object Perception",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "14646--14656",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228835"
}

@inproceedings{bb233861,
  AUTHOR    = "Tu, Y. and Hu, R. and Sang, J.",
  TITLE     = "ODE: Open-Set Evaluation of Hallucinations in Multimodal Large Language Models",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "19836--19845",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228836"
}

@inproceedings{bb233862,
  AUTHOR    = "Liu, J.Z. and Fu, Y.H. and Xie, R. and Xie, R. and Sun, X. and Lian, F.Z. and Kang, Z. and Li, X.R.",
  TITLE     = "PhD: A ChatGPT-Prompted Visual hallucination Evaluation Dataset",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "19857--19866",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228837"
}

@inproceedings{bb233863,
  AUTHOR    = "Jiang, Z.Q. and Chen, J.K. and Zhu, B. and Luo, T.J. and Shen, Y.K. and Yang, X.",
  TITLE     = "Devils in Middle Layers of Large Vision-Language Models: Interpreting, Detecting and Mitigating Object Hallucinations via Attention Lens",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "25004--25014",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228838"
}

@inproceedings{bb233864,
  AUTHOR    = "Park, E. and Kim, M. and Kim, G.",
  TITLE     = "HalLoc: Token-level Localization of Hallucinations for Vision Language Models",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "29893--29903",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228839"
}

@inproceedings{bb233865,
  AUTHOR    = "Suo, W. and Zhang, L.J. and Sun, M.Y. and Wu, L.Y.B. and Wang, P. and Zhang, Y.N.",
  TITLE     = "Octopus: Alleviating Hallucination via Dynamic Contrastive Decoding",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "29904--29914",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228840"
}

@inproceedings{bb233866,
  AUTHOR    = "An, W.B. and Tian, F. and Leng, S. and Nie, J.H. and Lin, H. and Wang, Q.Y. and Chen, P. and Zhang, X.Q. and Lu, S.J.",
  TITLE     = "Mitigating Object Hallucinations in Large Vision-Language Models with Assembly of Global and Local Attention",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "29915--29926",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228841"
}

@inproceedings{bb233867,
  AUTHOR    = "Zhuang, X.W. and Zhu, Z.H. and Xie, Y.X. and Liang, L.M. and Zou, Y.X.",
  TITLE     = "VASparse: Towards Efficient Visual Hallucination Mitigation via Visual-Aware Token Sparsification",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "4189--4199",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228842"
}

@inproceedings{bb233868,
  AUTHOR    = "Basak, D. and Bhatt, S. and Kanduri, S. and Desarkar, M.S.",
  TITLE     = "Aerial Mirage: Unmasking Hallucinations in Large Vision Language Models",
  BOOKTITLE = WACV25,
  YEAR      = "2025",
  PAGES     = "5500--5508",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228843"
}

@inproceedings{bb233869,
  AUTHOR    = "Tang, F.L. and Liu, C.Z. and Xu, Z.X. and Hu, M. and Huang, Z. and Xue, H.C. and Chen, Z.Y. and Peng, Z.L. and Yang, Z.W. and Zhou, S.J. and Li, W.X. and Li, Y.L. and Song, W.X. and Su, S.Y. and Feng, W. and Su, J. and Lin, M. and Peng, Y.F. and Cheng, X.L. and Razzak, I. and Ge, Z.Y.",
  TITLE     = "Seeing Far and Clearly: Mitigating Hallucinations in MLLMs with Attention Causal Decoding",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "26147--26159",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228844"
}

@inproceedings{bb233870,
  AUTHOR    = "Yang, J.N. and Chen, X. and Madaan, N. and Iyengar, M. and Qian, S. and Fouhey, D.F. and Chai, J.",
  TITLE     = "3D-GRAND: A Million-Scale Dataset for 3D-LLMs with Better Grounding and Less Hallucination",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "29501--29512",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228845"
}

@inproceedings{bb233871,
  AUTHOR    = "Yoon, D. and Song, Y. and Park, W.",
  TITLE     = "Stop learning it all to mitigate visual hallucination, Focus on the hallucination target",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "4200--4208",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228846"
}

@inproceedings{bb233872,
  AUTHOR    = "Chen, J.Z. and Zhang, T.S. and Huang, S.Y. and Niu, Y.W. and Zhang, L.F. and Wen, L.J. and Hu, X.M.",
  TITLE     = "ICT: Image-Object Cross-Level Trusted Intervention for Mitigating Object Hallucination in Large Vision-Language Models",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "4209--4221",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228847"
}

@inproceedings{bb233873,
  AUTHOR    = "Kim, B. and Shin, W. and Lee, K. and Jung, Y. and Seo, S.",
  TITLE     = "Make VLM Recognize Visual Hallucination on Cartoon Character Image with Pose Information",
  BOOKTITLE = WACV25,
  YEAR      = "2025",
  PAGES     = "5398--5407",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228848"
}

@inproceedings{bb233874,
  AUTHOR    = "Huang, P.H. and Li, J.L. and Chen, C.P. and Chang, M.C. and Chen, W.C.",
  TITLE     = "Who Brings the Frisbee: Probing Hidden Hallucination Factors in Large Vision-Language Model via Causality Analysis",
  BOOKTITLE = WACV25,
  YEAR      = "2025",
  PAGES     = "6125--6135",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228849"
}

@inproceedings{bb233875,
  AUTHOR    = "Liu, S. and Zheng, K. and Chen, W.",
  TITLE     = "Paying More Attention to Image: A Training-free Method for Alleviating Hallucination in LVLMS",
  BOOKTITLE = ECCV24,
  YEAR      = "2024",
  PAGES     = "LXXXIII: 125--140",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228850"
}

@inproceedings{bb233876,
  AUTHOR    = "Zhang, J. and Wang, T. and Zhang, H.G. and Lu, P. and Zheng, F.",
  TITLE     = "Reflective Instruction Tuning: Mitigating Hallucinations in Large Vision-language Models",
  BOOKTITLE = ECCV24,
  YEAR      = "2024",
  PAGES     = "LXVIII: 196--213",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228851"
}

@inproceedings{bb233877,
  AUTHOR    = "Kaul, P. and Li, Z.Z. and Yang, H. and Dukler, Y. and Swaminathan, A. and Taylor, C.J. and Soatto, S.",
  TITLE     = "THRONE: An Object-Based Hallucination Benchmark for the Free-Form Generations of Large Vision-Language Models",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "27218--27228",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228852"
}

@inproceedings{bb233878,
  AUTHOR    = "Jiang, C.Y. and Xu, H.Y. and Dong, M.F. and Chen, J.X. and Ye, W. and Yan, M. and Ye, Q.H. and Zhang, J. and Huang, F. and Zhang, S.K.",
  TITLE     = "Hallucination Augmented Contrastive Learning for Multimodal Large Language Model",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "27026--27036",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228853"
}

@inproceedings{bb233879,
  AUTHOR    = "Huang, Q.D. and Dong, X.Y. and Zhang, P. and Wang, B. and He, C.H. and Wang, J.Q. and Lin, D. and Zhang, W.M. and Yu, N.H.",
  TITLE     = "OPERA: Alleviating Hallucination in Multi-Modal Large Language Models via Over-Trust Penalty and Retrospection-Allocation",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13418--13427",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228854"
}

@inproceedings{bb233880,
  AUTHOR    = "Yu, Q.F. and Li, J.C. and Wei, L.H. and Pang, L. and Ye, W.T. and Qin, B.S. and Tang, S.L. and Tian, Q. and Zhuang, Y.T.",
  TITLE     = "HalluciDoctor: Mitigating Hallucinatory Toxicity in Visual Instruction Data",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "12944--12953",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228855"
}

@inproceedings{bb233881,
  AUTHOR    = "Favero, A. and Zancato, L. and Trager, M. and Choudhary, S. and Perera, P. and Achille, A. and Swaminathan, A. and Soatto, S.",
  TITLE     = "Multi-Modal Hallucination Control by Visual Information Grounding",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "14303--14312",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228856"
}

@inproceedings{bb233882,
  AUTHOR    = "Ouali, Y. and Bulat, A. and Martinez, B. and Tzimiropoulos, G.",
  TITLE     = "CLIP-DPO: Vision-language Models as a Source of Preference for Fixing Hallucinations in LVLMS",
  BOOKTITLE = ECCV24,
  YEAR      = "2024",
  PAGES     = "LXXVI: 395--413",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228857"
}

@inproceedings{bb233883,
  AUTHOR    = "Ye Bin, M. and Hyeon Woo, N. and Choi, W. and Oh, T.H.",
  TITLE     = "Beaf: Observing Before-after Changes to Evaluate Hallucination in Vision-language Models",
  BOOKTITLE = ECCV24,
  YEAR      = "2024",
  PAGES     = "XI: 232--248",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228858"
}

@inproceedings{bb233884,
  AUTHOR    = "Kim, M. and Kim, M. and Bae, J. and Choi, S. and Kim, S. and Chang, B.",
  TITLE     = "Exploiting Semantic Reconstruction to Mitigate Hallucinations in Vision-language Models",
  BOOKTITLE = ECCV24,
  YEAR      = "2024",
  PAGES     = "LXXXVI: 236--252",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228859"
}

@inproceedings{bb233885,
  AUTHOR    = "Guan, T.R. and Liu, F. and Wu, X. and Xian, R.Q. and Li, Z.X. and Liu, X.Y. and Wang, X. and Chen, L. and Huang, F. and Yacoob, Y. and Manocha, D. and Zhou, T.Y.",
  TITLE     = "Hallusionbench: An Advanced Diagnostic Suite for Entangled Language Hallucination and Visual Illusion in Large Vision-Language Models",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "14375--14385",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228860"
}

@inproceedings{bb233886,
  AUTHOR    = "Leng, S. and Zhang, H. and Chen, G.Z. and Li, X. and Lu, S.J. and Miao, C.Y. and Bing, L.",
  TITLE     = "Mitigating Object Hallucinations in Large Vision-Language Models through Visual Contrastive Decoding",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13872--13882",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228861"
}

@inproceedings{bb233887,
  AUTHOR    = "Wang, Z. and Bingham, G. and Yu, A.W. and Le, Q.V. and Luong, T. and Ghiasi, G.",
  TITLE     = "Haloquest: A Visual Hallucination Dataset for Advancing Multimodal Reasoning",
  BOOKTITLE = ECCV24,
  YEAR      = "2024",
  PAGES     = "LXXVII: 288--304",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228862"
}

@inproceedings{bb233888,
  AUTHOR    = "Wang, T.J.J. and Laaksonen, J. and Langer, T. and Arponen, H. and Bishop, T.E.",
  TITLE     = "Learning by Hallucinating: Vision-Language Pre-training with Weak Supervision",
  BOOKTITLE = WACV23,
  YEAR      = "2023",
  PAGES     = "1073--1083",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT228863"
}

% ---- BIBSOURCE page: applicat803jailb5.html -------------------------------

% MONTH uses the standard unquoted three-letter macro so the bibliography
% style controls how the month is rendered.
@article{bb233889,
  AUTHOR    = "Wang, Y.Z. and Hu, W.B. and Dong, Y.P. and Liu, J. and Zhang, H.W. and Hong, R.C.",
  TITLE     = "Align Is Not Enough: Multimodal Universal Jailbreak Attack Against Multimodal Large Language Models",
  JOURNAL   = CirSysVideo,
  VOLUME    = "35",
  YEAR      = "2025",
  NUMBER    = "6",
  MONTH     = jun,
  PAGES     = "5475--5488",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803jailb5.html#TT228864"
}

@inproceedings{bb233890,
  AUTHOR    = "Hossain, M.Z. and Imteaj, A.",
  TITLE     = "SLADE: Shielding against Dual Exploits in Large Vision-Language Models",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "24244--24254",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803jailb5.html#TT228865"
}

@inproceedings{bb233891,
  AUTHOR    = "Jeong, J. and Bae, S. and Jung, Y. and Hwang, J. and Yang, E.",
  TITLE     = "Playing the Fool: Jailbreaking LLMs and Multimodal LLMs with Out-of-Distribution Strategy",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "29937--29946",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803jailb5.html#TT228866"
}

@inproceedings{bb233892,
  AUTHOR    = "Yang, Z.P. and Fan, J. and Yan, A. and Gao, E. and Lin, X. and Li, T. and Mo, K. and Dong, C.",
  TITLE     = "Distraction is All You Need for Multimodal Large Language Model Jailbreaking",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "9467--9476",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803jailb5.html#TT228867"
}

@inproceedings{bb233893,
  AUTHOR    = "Hao, S.Y. and Hooi, B. and Liu, J. and Chang, K.W. and Huang, Z. and Cai, Y.J.",
  TITLE     = "Exploring Visual Vulnerabilities via Multi-Loss Adversarial Search for Jailbreaking Vision-Language Models",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "19890--19899",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803jailb5.html#TT228868"
}

@inproceedings{bb233894,
  AUTHOR    = "Wang, H. and Wang, G. and Zhang, H.",
  TITLE     = "Steering Away from Harm: An Adaptive Approach to Defending Vision Language Model Against Jailbreaks",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "29947--29957",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803jailb5.html#TT228869"
}

@inproceedings{bb233895,
  AUTHOR    = "Ghosal, S.S. and Chakraborty, S. and Singh, V. and Guan, T.R. and Wang, M. and Beirami, A. and Huang, F. and Velasquez, A. and Manocha, D. and Bedi, A.S.",
  TITLE     = "Immune: Improving Safety Against Jailbreaks in Multi-modal LLMs via Inference-Time Alignment",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "25038--25049",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803jailb5.html#TT228870"
}

@inproceedings{bb233896,
  AUTHOR    = "Xiang, Y.L. and Hong, Z.M. and Yao, L. and Wang, D.D. and Liu, T.L.",
  TITLE     = "Jailbreaking the Non-Transferable Barrier via Test-Time Data Disguising",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "30671--30681",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803jailb5.html#TT228871"
}

@inproceedings{bb233897,
  AUTHOR    = "Chen, J.X. and Dong, J.H. and Xie, X.H.",
  TITLE     = "Mind the Trojan Horse: Image Prompt Adapter Enabling Scalable and Deceptive Jailbreaking",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "23785--23794",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803jailb5.html#TT228872"
}

@inproceedings{bb233898,
  AUTHOR    = "Li, Y.F. and Guo, H. and Zhou, K. and Zhao, W.X. and Wen, J.R.",
  TITLE     = "Images are Achilles' Heel of Alignment: Exploiting Visual Vulnerabilities for Jailbreaking Multimodal Large Language Models",
  BOOKTITLE = ECCV24,
  YEAR      = "2024",
  PAGES     = "LXXIII: 174--189",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803jailb5.html#TT228873"
}

% ---- BIBSOURCE page: applicat803vidq2.html --------------------------------

@article{bb233899,
  AUTHOR    = "Wu, Y.C. and Yang, J.C.",
  TITLE     = "A Robust Passage Retrieval Algorithm for Video Question Answering",
  JOURNAL   = CirSysVideo,
  VOLUME    = "18",
  YEAR      = "2008",
  NUMBER    = "10",
  MONTH     = oct,
  PAGES     = "1411--1421",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT228874"
}