@inproceedings{bb235100,
AUTHOR = "Teney, D. and Anderson, P. and He, X. and van den Hengel, A. J.",
TITLE = "Tips and Tricks for Visual Question Answering:
Learnings from the 2017 Challenge",
BOOKTITLE = CVPR18,
YEAR = "2018",
PAGES = "4223--4232",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT230069"}
@inproceedings{bb235101,
AUTHOR = "Bai, Y. L. and Fu, J. L. and Zhao, T. J. and Mei, T.",
TITLE = "Deep Attention Neural Tensor Network for Visual Question Answering",
BOOKTITLE = ECCV18,
YEAR = "2018",
PAGES = "XII: 21--37",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT230070"}
@inproceedings{bb235102,
AUTHOR = "Sinha, A. and Ayush, K.",
TITLE = "Towards Mathematical Reasoning: A Multimodal Deep Learning Approach",
BOOKTITLE = ICIP18,
YEAR = "2018",
PAGES = "4028--4032",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT230071"}
@inproceedings{bb235103,
AUTHOR = "Rosso Mateus, A. and Gonzalez, F. A. and Montes y Gomez, M.",
TITLE = "A Two-Step Neural Network Approach to Passage Retrieval for Open Domain
Question Answering",
BOOKTITLE = CIARP17,
YEAR = "2017",
PAGES = "566--574",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT230072"}
@inproceedings{bb235104,
AUTHOR = "Zhu, C. and Zhao, Y. and Huang, S. and Tu, K. and Ma, Y.",
TITLE = "Structured Attentions for Visual Question Answering",
BOOKTITLE = ICCV17,
YEAR = "2017",
PAGES = "1300--1309",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT230073"}
@inproceedings{bb235105,
AUTHOR = "Hu, R. and Andreas, J. and Rohrbach, M. and Darrell, T. J. and Saenko, K.",
TITLE = "Learning to Reason:
End-to-End Module Networks for Visual Question Answering",
BOOKTITLE = ICCV17,
YEAR = "2017",
PAGES = "804--813",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT230074"}
@inproceedings{bb235106,
AUTHOR = "Peris, A. and Casacuberta, F.",
TITLE = "Interactive-Predictive Neural Multimodal Systems",
BOOKTITLE = IbPRIA19,
YEAR = "2019",
PAGES = "I: 16--28",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT230075"}
@inproceedings{bb235107,
AUTHOR = "Bolanos, M. and Peris, A. and Casacuberta, F. and Radeva, P.",
TITLE = "{VIBIKNet}: Visual Bidirectional Kernelized Network for Visual Question
Answering",
BOOKTITLE = IbPRIA17,
YEAR = "2017",
PAGES = "372--380",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT230076"}
@inproceedings{bb235108,
AUTHOR = "Kafle, K. and Kanan, C.",
TITLE = "An Analysis of Visual Question Answering Algorithms",
BOOKTITLE = ICCV17,
YEAR = "2017",
PAGES = "1983--1991",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT230077"}
@inproceedings{bb235109,
AUTHOR = "Kafle, K. and Kanan, C.",
TITLE = "Answer-Type Prediction for Visual Question Answering",
BOOKTITLE = CVPR16,
YEAR = "2016",
PAGES = "4976--4984",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT230078"}
@inproceedings{bb235110,
AUTHOR = "Wang, P. and Wu, Q. and Shen, C. and van den Hengel, A. J.",
TITLE = "The {VQA-Machine}: Learning How to Use Existing Vision Algorithms to
Answer New Questions",
BOOKTITLE = CVPR17,
YEAR = "2017",
PAGES = "3909--3918",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT230079"}
@inproceedings{bb235111,
AUTHOR = "Yu, D. and Fu, J. and Mei, T. and Rui, Y.",
TITLE = "Multi-level Attention Networks for Visual Question Answering",
BOOKTITLE = CVPR17,
YEAR = "2017",
PAGES = "4187--4195",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT230080"}
@inproceedings{bb235112,
AUTHOR = "Ramakrishnan, S. K. and Pal, A. and Sharma, G. and Mittal, A.",
TITLE = "An Empirical Evaluation of Visual Question Answering for Novel
Objects",
BOOKTITLE = CVPR17,
YEAR = "2017",
PAGES = "7312--7321",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT230081"}
@article{bb235113,
AUTHOR = "Gouthaman, K. V. and Nambiar, A. and Srinivas, K. S. and Mittal, A.",
TITLE = "Linguistically-aware attention for reducing the semantic gap in
vision-language tasks",
JOURNAL = PR,
VOLUME = "112",
YEAR = "2021",
PAGES = "107812",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT230082"}
@article{bb235114,
AUTHOR = "Zhou, K. Y. and Yang, J. K. and Loy, C. C. and Liu, Z. W.",
TITLE = "Learning to Prompt for Vision-Language Models",
JOURNAL = IJCV,
VOLUME = "130",
YEAR = "2022",
NUMBER = "9",
MONTH = sep,
PAGES = "2337--2348",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT230083"}
@inproceedings{bb235115,
AUTHOR = "Zhou, K. Y. and Yang, J. K. and Loy, C. C. and Liu, Z. W.",
TITLE = "Conditional Prompt Learning for Vision-Language Models",
BOOKTITLE = CVPR22,
YEAR = "2022",
PAGES = "16795--16804",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT230084"}
@article{bb235116,
AUTHOR = "Ma, C. C. and Liu, Y. and Deng, J. K. and Xie, L. X. and Dong, W. M. and Xu, C. S.",
TITLE = "Understanding and Mitigating Overfitting in Prompt Tuning for
Vision-Language Models",
JOURNAL = CirSysVideo,
VOLUME = "33",
YEAR = "2023",
NUMBER = "9",
MONTH = sep,
PAGES = "4616--4629",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT230085"}
@article{bb235117,
AUTHOR = "Chen, C. Q. and Han, D. Z. and Chang, C. C.",
TITLE = "{MPCCT}: Multimodal vision-language learning paradigm with
context-based compact Transformer",
JOURNAL = PR,
VOLUME = "147",
YEAR = "2024",
PAGES = "110084",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT230086"}
@article{bb235118,
AUTHOR = "Yu, Z. T. and Zhao, J. and Guo, C. L. and Yang, Y.",
TITLE = "{StableNet}: Distinguishing the hard samples to overcome language
priors in visual question answering",
JOURNAL = IET-CV,
VOLUME = "18",
YEAR = "2024",
NUMBER = "2",
PAGES = "315--327",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT230087"}
@article{bb235119,
AUTHOR = "Bazi, Y. and Bashmal, L. and Rahhal, M. M. A. and Ricci, R. and Melgani, F.",
TITLE = "{RS-LLaVA}: A Large Vision-Language Model for Joint Captioning and
Question Answering in Remote Sensing Imagery",
JOURNAL = RS,
VOLUME = "16",
YEAR = "2024",
NUMBER = "9",
PAGES = "1477",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT230088"}
@article{bb235120,
AUTHOR = "Tan, Y. T. and Chen, Y. Y. and Wang, J. Q.",
TITLE = "{DSTA}: Reinforcing Vision-Language Understanding for Scene-Text {VQA}
With Dual-Stream Training Approach",
JOURNAL = SPLetters,
VOLUME = "32",
YEAR = "2025",
PAGES = "6--10",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT230089"}
@article{bb235121,
AUTHOR = "Alsabbagh, A. R. and Mansour, T. and Al Kharabsheh, M. and Ebdah, A. S. and Al Emaryeen, R. and Al Nahhas, S. and Mahafza, W. and Al Kadi, O.",
TITLE = "{MiniMedGPT}: Efficient Large Vision-Language Model for medical Visual
Question Answering",
JOURNAL = PRL,
VOLUME = "189",
YEAR = "2025",
PAGES = "8--16",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT230090"}
@article{bb235122,
AUTHOR = "Wang, X. and Wu, J. L. and Lin, Z. and Zhang, F. Z. and Zhang, D. and Nie, L. Q.",
TITLE = "Video {DataFlywheel}: Resolving the Impossible Data Trinity in
Video-Language Understanding",
JOURNAL = PAMI,
VOLUME = "47",
YEAR = "2025",
NUMBER = "4",
MONTH = apr,
PAGES = "2912--2923",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT230091"}
@article{bb235123,
AUTHOR = "Shen, R. and Inoue, N. and Guan, D. and Cai, R. and Kot, A. C. and Shinoda, K.",
TITLE = "{ContextualCoder}: Adaptive In-Context Prompting for Programmatic
Visual Question Answering",
JOURNAL = MultMed,
VOLUME = "27",
YEAR = "2025",
PAGES = "4936--4949",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT230092"}
@inproceedings{bb235124,
AUTHOR = "Shen, R. and Inoue, N. and Shinoda, K.",
TITLE = "Pyramid Coder: Hierarchical Code Generator for Compositional Visual
Question Answering",
BOOKTITLE = ICIP24,
YEAR = "2024",
PAGES = "430--436",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT230093"}
@inproceedings{bb235125,
AUTHOR = "Panagopoulou, A. and Zhou, H. L. and Savarese, S. and Xiong, C. M. and Callison-Burch, C. and Yatskar, M. and Niebles, J. C.",
TITLE = "{ViUniT}: Visual Unit Tests for More Robust Visual Programming",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "24646--24656",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT230094"}
@inproceedings{bb235126,
AUTHOR = "Wang, W. Z. and Duan, C. and Peng, Z. H. and Liu, Y. X. and Zhou, B.",
TITLE = "Embodied Scene Understanding for Vision Language Models via {MetaVQA}",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "22453--22464",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT230095"}
@inproceedings{bb235127,
AUTHOR = "Tian, X. Y. and Zou, S. and Yang, Z. Y. and Zhang, J.",
TITLE = "Identifying and Mitigating Position Bias of Multi-image
Vision-Language Models",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "10599--10609",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT230096"}
@inproceedings{bb235128,
AUTHOR = "Sheng, L. J. and Liang, J. and Wang, Z. and He, R.",
TITLE = "{R-TPT}: Improving Adversarial Robustness of Vision-Language Models
through Test-Time Prompt Tuning",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "29958--29967",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT230097"}
@inproceedings{bb235129,
AUTHOR = "Das, D. and Talon, D. and Mancini, M. and Wang, Y. M. and Ricci, E.",
TITLE = "One {VLM} to Keep it Learning: Generation and Balancing for Data-free
Continual Visual Question Answering",
BOOKTITLE = WACV25,
YEAR = "2025",
PAGES = "5635--5645",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT230098"}
@inproceedings{bb235130,
AUTHOR = "Ishmam, M. F. and Tashdeed, I. and Saadat, T. A. and Ashmafee, M. H. and Kamal, A. R. M. and Hossain, M. A.",
TITLE = "Visual Robustness Benchmark for Visual Question Answering ({VQA})",
BOOKTITLE = WACV25,
YEAR = "2025",
PAGES = "6623--6633",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT230099"}
@inproceedings{bb235131,
AUTHOR = "Chen, X. and Djolonga, J. and Padlewski, P. and Mustafa, B. and Changpinyo, S. and Wu, J. L. and Ruiz, C. R. and Goodman, S. and Wang, X. and Tay, Y. and Shakeri, S. and Dehghani, M. and Salz, D. and Lucic, M. and Tschannen, M. and Nagrani, A. and Hu, H. and Joshi, M. and Pang, B. and Montgomery, C. and Pietrzyk, P. and Ritter, M. and Piergiovanni, A. and Minderer, M. and Pavetic, F. and Waters, A. and Li, G. and Alabdulmohsin, I. and Beyer, L. and Amelot, J. and Lee, K. and Steiner, A. P. and Li, Y. and Keysers, D. and Arnab, A. and Xu, Y. Z. and Rong, K. and Kolesnikov, A. and Seyedhosseini, M. and Angelova, A. and Zhai, X. H. and Houlsby, N. and Soricut, R.",
TITLE = "On Scaling Up a Multilingual Vision and Language Model",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "14432--14444",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT230100"}
@inproceedings{bb235132,
AUTHOR = "Li, R. J. and Wu, Y. and He, X. M.",
TITLE = "Learning by Correction: Efficient Tuning Task for Zero-Shot
Generative Vision-Language Reasoning",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "13428--13437",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT230101"}
@inproceedings{bb235133,
AUTHOR = "Khan, Z. and Fu, Y.",
TITLE = "Consistency and Uncertainty: Identifying Unreliable Responses From
Black-Box Vision-Language Models for Selective Visual Question
Answering",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "10854--10863",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT230102"}
@inproceedings{bb235134,
AUTHOR = "Gu, T. C. and Yang, K. C. and Liu, D. and Cai, W. D.",
TITLE = "{LaPA}: Latent Prompt Assist Model for Medical Visual Question
Answering",
BOOKTITLE = DEF-AI-MIA24,
YEAR = "2024",
PAGES = "4971--4980",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT230103"}
@inproceedings{bb235135,
AUTHOR = "Feinglass, J. and Yang, Y. Z.",
TITLE = "Towards Addressing the Misalignment of Object Proposal Evaluation for
Vision-Language Tasks via Semantic Grounding",
BOOKTITLE = WACV24,
YEAR = "2024",
PAGES = "4385--4395",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT230104"}
@inproceedings{bb235136,
AUTHOR = "Nadeem, A. and Hilton, A. and Dawes, R. and Thomas, G. and Mustafa, A.",
TITLE = "{CAD}: Contextual Multi-modal Alignment for Dynamic {AVQA}",
BOOKTITLE = WACV24,
YEAR = "2024",
PAGES = "7236--7248",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT230105"}
@inproceedings{bb235137,
AUTHOR = "Wu, W. and Li, Q. and Zhong, W. L. and Huang, J. Z.",
TITLE = "{MIVC}: Multiple Instance Visual Component for Visual-Language Models",
BOOKTITLE = WACV24,
YEAR = "2024",
PAGES = "8102--8111",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT230106"}
@inproceedings{bb235138,
AUTHOR = "Walmer, M. and Sikka, K. and Sur, I. and Shrivastava, A. and Jha, S.",
TITLE = "Dual-Key Multimodal Backdoors for Visual Question Answering",
BOOKTITLE = CVPR22,
YEAR = "2022",
PAGES = "15354--15364",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT230107"}
@inproceedings{bb235139,
AUTHOR = "Ding, Y. and Yu, J. and Liu, B. and Hu, Y. and Cui, M. X. and Wu, Q.",
TITLE = "{MuKEA}: Multimodal Knowledge Extraction and Accumulation for
Knowledge-based Visual Question Answering",
BOOKTITLE = CVPR22,
YEAR = "2022",
PAGES = "5079--5088",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT230108"}
@inproceedings{bb235140,
AUTHOR = "Gao, F. and Ping, Q. and Thattai, G. and Reganti, A. and Wu, Y. N. and Natarajan, P.",
TITLE = "Transform-Retrieve-Generate: Natural Language-Centric
Outside-Knowledge Visual Question Answering",
BOOKTITLE = CVPR22,
YEAR = "2022",
PAGES = "5057--5067",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT230109"}
@inproceedings{bb235141,
AUTHOR = "Aflalo, E. and Du, M. and Tseng, S. Y. and Liu, Y. F. and Wu, C. and Duan, N. and Lal, V.",
TITLE = "{VL-InterpreT}: An Interactive Visualization Tool for Interpreting
Vision-Language Transformers",
BOOKTITLE = CVPR22,
YEAR = "2022",
PAGES = "21374--21383",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT230110"}
@inproceedings{bb235142,
AUTHOR = "Jain, V. and Lodhavia, J.",
TITLE = "Automatic Question Tagging using k-Nearest Neighbors and Random
Forest",
BOOKTITLE = ISCV20,
YEAR = "2020",
PAGES = "1--4",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT230111"}
@article{bb235143,
AUTHOR = "Ye, Q. and Yu, Z. T. and Shao, R. and Cui, Y. W. and Kang, X. and Liu, X. and Torr, P. and Cao, X. C.",
TITLE = "{CAT+}: Investigating and Enhancing Audio-Visual Understanding in Large
Language Models",
JOURNAL = PAMI,
VOLUME = "47",
YEAR = "2025",
NUMBER = "10",
MONTH = oct,
PAGES = "8674--8690",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230112"}
@inproceedings{bb235144,
AUTHOR = "Yu, T. Y. and Zhang, H. and Li, Q. M. and Xu, Q. X. and Yao, Y. and Chen, D. and Lu, X. M. and Cui, G. and Dang, Y. K. and He, T. and Feng, X. C. and Song, J. and Zheng, B. and Liu, Z. Y. and Chua, T. S. and Sun, M. S.",
TITLE = "{RLAIF-V}: Open-Source {AI} Feedback Leads to Super {GPT-4V}
Trustworthiness",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "19985--19995",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230113"}
@inproceedings{bb235145,
AUTHOR = "Liang, J. and Huang, W. K. and Wan, G. C. and Yang, Q. and Ye, M.",
TITLE = "{LoRASculpt}: Sculpting {LoRA} for Harmonizing General and Specialized
Knowledge in Multimodal Large Language Models",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "26170--26180",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230114"}
@inproceedings{bb235146,
AUTHOR = "Cao, Y. and Xing, Y. and Zhang, J. and Lin, D. and Zhang, T. W. and Tsang, I. and Liu, Y. and Guo, Q.",
TITLE = "{SceneTAP}: Scene-Coherent Typographic Adversarial Planner against
Vision-Language Models in Real-World Environments",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "25050--25059",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230115"}
@inproceedings{bb235147,
AUTHOR = "Wang, Y. B. and Guan, J. and Liang, J. and He, R.",
TITLE = "Do We Really Need Curated Malicious Data for Safety Alignment in
Multi-modal Large Language Models?",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "19879--19889",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230116"}
@inproceedings{bb235148,
AUTHOR = "Peng, R. and He, H. Y. and Wei, Y. and Wen, Y. D. and Hu, D.",
TITLE = "Patch Matters: Training-free Fine-grained Image Caption Enhancement via
Local Perception",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "3963--3973",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230117"}
@inproceedings{bb235149,
AUTHOR = "Yang, Z. and Luo, X. and Han, D. Q. and Xu, Y. J. and Li, D. S.",
TITLE = "Mitigating Hallucinations in Large Vision-Language Models via {DPO}:
On-Policy Data Hold the Key",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "10610--10620",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230118"}
@inproceedings{bb235150,
AUTHOR = "Bae, K. and Kim, J. and Lee, S. and Lee, S. and Lee, G. and Choi, J.",
TITLE = "{MASH-VLM}: Mitigating Action-Scene Hallucination in {Video-LLMs} through
Disentangled Spatial-Temporal Representations",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "13744--13753",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230119"}
@inproceedings{bb235151,
AUTHOR = "Yin, H. and Si, G. Z. and Wang, Z.",
TITLE = "{ClearSight}: Visual Signal Enhancement for Object Hallucination
Mitigation in Multimodal Large Language Models",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "14625--14634",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230120"}
@inproceedings{bb235152,
AUTHOR = "Yang, L. and Zheng, Z. W. and Chen, B. and Zhao, Z. Y. and Lin, C. H. and Shen, C.",
TITLE = "Nullu: Mitigating Object Hallucinations in Large Vision-Language
Models via {HalluSpace} Projection",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "14635--14645",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230121"}
@inproceedings{bb235153,
AUTHOR = "Wu, Y. C. and Zhang, L. and Yao, H. and Du, J. L. and Yan, K. and Ding, S. H. and Wu, Y. S. and Li, X. Q.",
TITLE = "Antidote: A Unified Framework for Mitigating {LVLM} Hallucinations in
Counterfactual Presupposition and Object Perception",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "14646--14656",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230122"}
@inproceedings{bb235154,
AUTHOR = "Tu, Y. and Hu, R. and Sang, J.",
TITLE = "{ODE}: Open-Set Evaluation of Hallucinations in Multimodal Large
Language Models",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "19836--19845",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230123"}
@inproceedings{bb235155,
AUTHOR = "Liu, J. Z. and Fu, Y. H. and Xie, R. and Xie, R. and Sun, X. and Lian, F. Z. and Kang, Z. and Li, X. R.",
TITLE = "{PhD}: A {ChatGPT}-Prompted Visual hallucination Evaluation Dataset",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "19857--19866",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230124"}
@inproceedings{bb235156,
AUTHOR = "Jiang, Z. Q. and Chen, J. K. and Zhu, B. and Luo, T. J. and Shen, Y. K. and Yang, X.",
TITLE = "Devils in Middle Layers of Large Vision-Language Models:
Interpreting, Detecting and Mitigating Object Hallucinations via
Attention Lens",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "25004--25014",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230125"}
@inproceedings{bb235157,
AUTHOR = "Park, E. and Kim, M. and Kim, G.",
TITLE = "{HalLoc}: Token-level Localization of Hallucinations for Vision
Language Models",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "29893--29903",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230126"}
@inproceedings{bb235158,
AUTHOR = "Suo, W. and Zhang, L. J. and Sun, M. Y. and Wu, L. Y. B. and Wang, P. and Zhang, Y. N.",
TITLE = "Octopus: Alleviating Hallucination via Dynamic Contrastive Decoding",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "29904--29914",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230127"}
@inproceedings{bb235159,
AUTHOR = "An, W. B. and Tian, F. and Leng, S. and Nie, J. H. and Lin, H. and Wang, Q. Y. and Chen, P. and Zhang, X. Q. and Lu, S. J.",
TITLE = "Mitigating Object Hallucinations in Large Vision-Language Models with
Assembly of Global and Local Attention",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "29915--29926",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230128"}
@inproceedings{bb235160,
AUTHOR = "Zhuang, X. W. and Zhu, Z. H. and Xie, Y. X. and Liang, L. M. and Zou, Y. X.",
TITLE = "{VASparse}: Towards Efficient Visual Hallucination Mitigation via
Visual-Aware Token Sparsification",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "4189--4199",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230129"}
@inproceedings{bb235161,
AUTHOR = "Basak, D. and Bhatt, S. and Kanduri, S. and Desarkar, M. S.",
TITLE = "Aerial Mirage: Unmasking Hallucinations in Large Vision Language
Models",
BOOKTITLE = WACV25,
YEAR = "2025",
PAGES = "5500--5508",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230130"}
@inproceedings{bb235162,
AUTHOR = "Tang, F. L. and Liu, C. Z. and Xu, Z. X. and Hu, M. and Huang, Z. and Xue, H. C. and Chen, Z. Y. and Peng, Z. L. and Yang, Z. W. and Zhou, S. J. and Li, W. X. and Li, Y. L. and Song, W. X. and Su, S. Y. and Feng, W. and Su, J. and Lin, M. and Peng, Y. F. and Cheng, X. L. and Razzak, I. and Ge, Z. Y.",
TITLE = "Seeing Far and Clearly: Mitigating Hallucinations in {MLLMs} with
Attention Causal Decoding",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "26147--26159",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230131"}
@inproceedings{bb235163,
AUTHOR = "Yang, J. N. and Chen, X. and Madaan, N. and Iyengar, M. and Qian, S. and Fouhey, D. F. and Chai, J.",
TITLE = "{3D-GRAND}: A Million-Scale Dataset for {3D-LLMs} with Better Grounding
and Less Hallucination",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "29501--29512",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230132"}
@inproceedings{bb235164,
AUTHOR = "Yoon, D. and Song, Y. and Park, W.",
TITLE = "Stop learning it all to mitigate visual hallucination, Focus on the
hallucination target",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "4200--4208",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230133"}
@inproceedings{bb235165,
AUTHOR = "Chen, J. Z. and Zhang, T. S. and Huang, S. Y. and Niu, Y. W. and Zhang, L. F. and Wen, L. J. and Hu, X. M.",
TITLE = "{ICT}: Image-Object Cross-Level Trusted Intervention for Mitigating
Object Hallucination in Large Vision-Language Models",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "4209--4221",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230134"}
@inproceedings{bb235166,
AUTHOR = "Huang, P. H. and Li, J. L. and Chen, C. P. and Chang, M. C. and Chen, W. C.",
TITLE = "Who Brings the Frisbee: Probing Hidden Hallucination Factors in Large
Vision-Language Model via Causality Analysis",
BOOKTITLE = WACV25,
YEAR = "2025",
PAGES = "6125--6135",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230135"}
@inproceedings{bb235167,
AUTHOR = "Liu, S. and Zheng, K. and Chen, W.",
TITLE = "Paying More Attention to Image: A Training-free Method for Alleviating
Hallucination in {LVLMs}",
BOOKTITLE = ECCV24,
YEAR = "2024",
PAGES = "LXXXIII: 125--140",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230136"}
@inproceedings{bb235168,
AUTHOR = "Zhang, J. and Wang, T. and Zhang, H. G. and Lu, P. and Zheng, F.",
TITLE = "Reflective Instruction Tuning: Mitigating Hallucinations in Large
Vision-language Models",
BOOKTITLE = ECCV24,
YEAR = "2024",
PAGES = "LXVIII: 196--213",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230137"}
@inproceedings{bb235169,
AUTHOR = "Kaul, P. and Li, Z. Z. and Yang, H. and Dukler, Y. and Swaminathan, A. and Taylor, C. J. and Soatto, S.",
TITLE = "{THRONE}: An Object-Based Hallucination Benchmark for the Free-Form
Generations of Large Vision-Language Models",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "27218--27228",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230138"}
@inproceedings{bb235170,
AUTHOR = "Jiang, C. Y. and Xu, H. Y. and Dong, M. F. and Chen, J. X. and Ye, W. and Yan, M. and Ye, Q. H. and Zhang, J. and Huang, F. and Zhang, S. K.",
TITLE = "Hallucination Augmented Contrastive Learning for Multimodal Large
Language Model",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "27026--27036",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230139"}
@inproceedings{bb235171,
AUTHOR = "Huang, Q. D. and Dong, X. Y. and Zhang, P. and Wang, B. and He, C. H. and Wang, J. Q. and Lin, D. and Zhang, W. M. and Yu, N. H.",
TITLE = "{OPERA}: Alleviating Hallucination in Multi-Modal Large Language Models
via Over-Trust Penalty and Retrospection-Allocation",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "13418--13427",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230140"}
@inproceedings{bb235172,
AUTHOR = "Yu, Q. F. and Li, J. C. and Wei, L. H. and Pang, L. and Ye, W. T. and Qin, B. S. and Tang, S. L. and Tian, Q. and Zhuang, Y. T.",
TITLE = "{HalluciDoctor}: Mitigating Hallucinatory Toxicity in Visual
Instruction Data",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "12944--12953",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230141"}
@inproceedings{bb235173,
AUTHOR = "Favero, A. and Zancato, L. and Trager, M. and Choudhary, S. and Perera, P. and Achille, A. and Swaminathan, A. and Soatto, S.",
TITLE = "Multi-Modal Hallucination Control by Visual Information Grounding",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "14303--14312",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230142"}
@inproceedings{bb235174,
AUTHOR = "Ouali, Y. and Bulat, A. and Martinez, B. and Tzimiropoulos, G.",
TITLE = "{CLIP-DPO}: Vision-language Models as a Source of Preference for Fixing
Hallucinations in {LVLMs}",
BOOKTITLE = ECCV24,
YEAR = "2024",
PAGES = "LXXVI: 395--413",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230143"}
@inproceedings{bb235175,
AUTHOR = "Ye Bin, M. and Hyeon Woo, N. and Choi, W. and Oh, T. H.",
TITLE = "{BEAF}: Observing Before-after Changes to Evaluate Hallucination in
Vision-language Models",
BOOKTITLE = ECCV24,
YEAR = "2024",
PAGES = "XI: 232--248",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230144"}
@inproceedings{bb235176,
AUTHOR = "Kim, M. and Kim, M. and Bae, J. and Choi, S. and Kim, S. and Chang, B.",
TITLE = "Exploiting Semantic Reconstruction to Mitigate Hallucinations in
Vision-language Models",
BOOKTITLE = ECCV24,
YEAR = "2024",
PAGES = "LXXXVI: 236--252",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230145"}
@inproceedings{bb235177,
AUTHOR = "Guan, T. R. and Liu, F. and Wu, X. and Xian, R. Q. and Li, Z. X. and Liu, X. Y. and Wang, X. and Chen, L. and Huang, F. and Yacoob, Y. and Manocha, D. and Zhou, T. Y.",
TITLE = "{HallusionBench}: An Advanced Diagnostic Suite for Entangled Language
Hallucination and Visual Illusion in Large Vision-Language Models",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "14375--14385",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230146"}
@inproceedings{bb235178,
AUTHOR = "Leng, S. and Zhang, H. and Chen, G. Z. and Li, X. and Lu, S. J. and Miao, C. Y. and Bing, L.",
TITLE = "Mitigating Object Hallucinations in Large Vision-Language Models
through Visual Contrastive Decoding",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "13872--13882",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230147"}
@inproceedings{bb235179,
AUTHOR = "Wang, Z. and Bingham, G. and Yu, A. W. and Le, Q. V. and Luong, T. and Ghiasi, G.",
TITLE = "{HaloQuest}: A Visual Hallucination Dataset for Advancing Multimodal
Reasoning",
BOOKTITLE = ECCV24,
YEAR = "2024",
PAGES = "LXXVII: 288--304",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230148"}
@inproceedings{bb235180,
AUTHOR = "Wang, T. J. J. and Laaksonen, J. and Langer, T. and Arponen, H. and Bishop, T. E.",
TITLE = "Learning by Hallucinating:
Vision-Language Pre-training with Weak Supervision",
BOOKTITLE = WACV23,
YEAR = "2023",
PAGES = "1073--1083",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT230149"}
@article{bb235181,
AUTHOR = "Wang, Y. Z. and Hu, W. B. and Dong, Y. P. and Liu, J. and Zhang, H. W. and Hong, R. C.",
TITLE = "Align Is Not Enough: Multimodal Universal Jailbreak Attack Against
Multimodal Large Language Models",
JOURNAL = CirSysVideo,
VOLUME = "35",
YEAR = "2025",
NUMBER = "6",
MONTH = jun,
PAGES = "5475--5488",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803jailb5.html#TT230150"}
@inproceedings{bb235182,
AUTHOR = "Hossain, M. Z. and Imteaj, A.",
TITLE = "{SLADE}: Shielding against Dual Exploits in Large Vision-Language
Models",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "24244--24254",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803jailb5.html#TT230151"}
@inproceedings{bb235183,
AUTHOR = "Jeong, J. and Bae, S. and Jung, Y. and Hwang, J. and Yang, E.",
TITLE = "Playing the Fool: Jailbreaking {LLMs} and Multimodal {LLMs} with
Out-of-Distribution Strategy",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "29937--29946",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803jailb5.html#TT230152"}
@inproceedings{bb235184,
AUTHOR = "Yang, Z. P. and Fan, J. and Yan, A. and Gao, E. and Lin, X. and Li, T. and Mo, K. and Dong, C.",
TITLE = "Distraction is All You Need for Multimodal Large Language Model
Jailbreaking",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "9467--9476",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803jailb5.html#TT230153"}
@inproceedings{bb235185,
AUTHOR = "Hao, S. Y. and Hooi, B. and Liu, J. and Chang, K. W. and Huang, Z. and Cai, Y. J.",
TITLE = "Exploring Visual Vulnerabilities via Multi-Loss Adversarial Search
for Jailbreaking Vision-Language Models",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "19890--19899",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803jailb5.html#TT230154"}
@inproceedings{bb235186,
AUTHOR = "Wang, H. and Wang, G. and Zhang, H.",
TITLE = "Steering Away from Harm: An Adaptive Approach to Defending Vision
Language Model Against Jailbreaks",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "29947--29957",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803jailb5.html#TT230155"}
@inproceedings{bb235187,
AUTHOR = "Ghosal, S. S. and Chakraborty, S. and Singh, V. and Guan, T. R. and Wang, M. and Beirami, A. and Huang, F. and Velasquez, A. and Manocha, D. and Bedi, A. S.",
TITLE = "Immune: Improving Safety Against Jailbreaks in Multi-modal {LLMs} via
Inference-Time Alignment",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "25038--25049",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803jailb5.html#TT230156"}
@inproceedings{bb235188,
AUTHOR = "Xiang, Y. L. and Hong, Z. M. and Yao, L. and Wang, D. D. and Liu, T. L.",
TITLE = "Jailbreaking the Non-Transferable Barrier via Test-Time Data
Disguising",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "30671--30681",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803jailb5.html#TT230157"}
@inproceedings{bb235189,
AUTHOR = "Chen, J. X. and Dong, J. H. and Xie, X. H.",
TITLE = "Mind the {Trojan} Horse: Image Prompt Adapter Enabling Scalable and
Deceptive Jailbreaking",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "23785--23794",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803jailb5.html#TT230158"}
@inproceedings{bb235190,
AUTHOR = "Li, Y. F. and Guo, H. and Zhou, K. and Zhao, W. X. and Wen, J. R.",
TITLE = "Images are {Achilles'} Heel of Alignment: Exploiting Visual
Vulnerabilities for Jailbreaking Multimodal Large Language Models",
BOOKTITLE = ECCV24,
YEAR = "2024",
PAGES = "LXXIII: 174--189",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803jailb5.html#TT230159"}
@article{bb235191,
AUTHOR = "Wu, Y. C. and Yang, J. C.",
TITLE = "A Robust Passage Retrieval Algorithm for Video Question Answering",
JOURNAL = CirSysVideo,
VOLUME = "18",
YEAR = "2008",
NUMBER = "10",
MONTH = oct,
PAGES = "1411--1421",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT230160"}
@inproceedings{bb235192,
AUTHOR = "Wu, Y. C. and Lee, Y. S. and Yang, J. C. and Yen, S. J.",
TITLE = "A New Passage Ranking Algorithm for Video Question Answering",
BOOKTITLE = PSIVT06,
YEAR = "2006",
PAGES = "563--572",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT230161"}
@article{bb235193,
AUTHOR = "Li, G. D. and Li, H. J. and Ming, Z. Y. and Hong, R. C. and Tang, S. and Chua, T. S.",
TITLE = "Question Answering over Community-Contributed Web Videos",
JOURNAL = MultMedMag,
VOLUME = "17",
YEAR = "2010",
NUMBER = "4",
MONTH = oct,
PAGES = "46--57",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT230162"}
@inproceedings{bb235194,
AUTHOR = "Song, Y. C. and Li, H. J.",
TITLE = "Mash-Up Approach for Web Video Category Recommendation",
BOOKTITLE = PSIVT10,
YEAR = "2010",
PAGES = "197--202",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT230163"}
@article{bb235195,
AUTHOR = "Guo, Z. Y. and Zhao, Z. and Jin, W. and Wei, Z. C. and Yang, M. and Wang, N. N. and Yuan, N. J.",
TITLE = "Multi-Turn Video Question Generation via Reinforced Multi-Choice
Attention Network",
JOURNAL = CirSysVideo,
VOLUME = "31",
YEAR = "2021",
NUMBER = "5",
PAGES = "1697--1710",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT230164"}
@article{bb235196,
AUTHOR = "Xue, H. Y. and Chu, W. and Zhao, Z. and Cai, D.",
TITLE = "A Better Way to Attend: Attention With Trees for Video Question
Answering",
JOURNAL = IP,
VOLUME = "27",
YEAR = "2018",
NUMBER = "11",
MONTH = nov,
PAGES = "5563--5574",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT230165"}
@article{bb235197,
AUTHOR = "Xue, H. Y. and Zhao, Z. and Cai, D.",
TITLE = "Unifying the Video and Question Attentions for Open-Ended Video
Question Answering",
JOURNAL = IP,
VOLUME = "26",
YEAR = "2017",
NUMBER = "12",
MONTH = dec,
PAGES = "5656--5666",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT230166"}
@article{bb235198,
AUTHOR = "Zhao, Z. and Xiao, S. W. and Song, Z. and Lu, C. J. and Xiao, J. and Zhuang, Y. T.",
TITLE = "Open-Ended Video Question Answering via Multi-Modal Conditional
Adversarial Networks",
JOURNAL = IP,
VOLUME = "29",
YEAR = "2020",
PAGES = "3859--3870",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT230167"}
@article{bb235199,
AUTHOR = "Zhao, Z. and Zhang, Z. and Xiao, S. W. and Xiao, Z. X. and Yan, X. H. and Yu, J. and Cai, D. and Wu, F.",
TITLE = "Long-Form Video Question Answering via Dynamic Hierarchical
Reinforced Networks",
JOURNAL = IP,
VOLUME = "28",
YEAR = "2019",
NUMBER = "12",
MONTH = dec,
PAGES = "5939--5952",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT230168"}
Last update: Oct 20, 2025 at 16:58:17