% Title hyphenation normalized to "Spatio-Temporal" (the upstream listing carries a stray space after the hyphen).
@inproceedings{bb242800,
  author    = {Chen, B. and Shvetsova, N. and Rouditchenko, A. and Kondermann, D. and Thomas, S. and Chang, S.F. and Feris, R. and Glass, J. and Kuehne, H.},
  title     = {What, When, and Where? Self-Supervised Spatio-Temporal Grounding in
Untrimmed Multi-Action Videos from Narrated Instructions},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {18419-18429},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidgr3.html#TT237714},
}

% Title hyphenation normalized to "Spatio-Temporal" (the upstream listing carries a stray space after the hyphen).
@inproceedings{bb242801,
  author    = {Wasim, S.T. and Naseer, M. and Khan, S. and Yang, M.H. and Khan, F.S.},
  title     = {VideoGrounding-DINO: Towards Open-Vocabulary Spatio-Temporal Video
Grounding},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {18909-18918},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidgr3.html#TT237715},
}

@inproceedings{bb242802,
  author    = {de la Jara, I.M. and Rodriguez Opazo, C. and Marrese Taylor, E. and Bravo Marquez, F.},
  title     = {An empirical study of the effect of video encoders on Temporal Video
Grounding},
  booktitle = CLVL23,
  year      = {2023},
  pages     = {2842-2847},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidgr3.html#TT237716},
}

@inproceedings{bb242803,
  author    = {Li, H.X. and Cao, M. and Cheng, X. and Li, Y.W. and Zhu, Z.H. and Zou, Y.X.},
  title     = {G2L: Semantically Aligned and Uniform Video Grounding via Geodesic
and Game Theory},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {11998-12008},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidgr3.html#TT237717},
}

@inproceedings{bb242804,
  author    = {Li, H. and Shu, X.J. and He, S. and Qiao, R.Z. and Wen, W. and Guo, T. and Gan, B. and Sun, X.},
  title     = {D3G: Exploring Gaussian Prior for Temporal Sentence Grounding with
Glance Annotation},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {13688-13700},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidgr3.html#TT237718},
}

@inproceedings{bb242805,
  author    = {Pan, Y.L. and He, X.T. and Gong, B. and Lv, Y.L. and Shen, Y.J. and Peng, Y.X. and Zhao, D.L.},
  title     = {Scanning Only Once: An End-to-end Framework for Fast Temporal
Grounding in Long Videos},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {13721-13731},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidgr3.html#TT237719},
}

@inproceedings{bb242806,
  author    = {Jang, J. and Park, J. and Kim, J. and Kwon, H. and Sohn, K.H.},
  title     = {Knowing Where to Focus: Event-aware Transformer for Video Grounding},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {13800-13810},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidgr3.html#TT237720},
}

@inproceedings{bb242807,
  author    = {Cao, M. and Wei, F.Y. and Xu, C. and Geng, X. and Chen, L. and Zhang, C. and Zou, Y.X. and Shen, T. and Jiang, D.X.},
  title     = {Iterative Proposal Refinement for Weakly-Supervised Video Grounding},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {6524-6534},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidgr3.html#TT237721},
}

@inproceedings{bb242808,
  author    = {Lu, Z.J. and Iftekhar, A.S.M. and Mittal, G. and Meng, T.J. and Wang, X. and Zhao, C. and Kukkala, R. and Elhamifar, E. and Chen, M.},
  title     = {DeCafNet: Delegate and Conquer for Efficient Temporal Grounding in
Long Videos},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {24066-24076},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidgr3.html#TT237722},
}

% Accented letters in the title are written as LaTeX escapes so the entry stays ASCII-only, like the author names elsewhere in this file.
@inproceedings{bb242809,
  author    = {Wang, L. and Mittal, G. and Sajeev, S. and Yu, Y. and Hall, M. and Boddeti, V.N. and Chen, M.},
  title     = {ProT{\'e}G{\'e}: Untrimmed Pretraining for Video Temporal Grounding by Video
Temporal Grounding},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {6575-6585},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidgr3.html#TT237723},
}

@inproceedings{bb242810,
  author    = {Chen, J. and Gao, D.F. and Lin, K.Q.H. and Shou, M.Z.},
  title     = {Affordance Grounding from Demonstration Video to Target Image},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {6799-6808},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidgr3.html#TT237724},
}

@inproceedings{bb242811,
  author    = {Zhang, Y.M. and Chen, X. and Jia, J.H. and Liu, S. and Ding, K.},
  title     = {Text-Visual Prompting for Efficient 2D Temporal Video Grounding},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {14794-14804},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidgr3.html#TT237725},
}

@inproceedings{bb242812,
  author    = {Li, M.Z. and Wang, H. and Zhang, W.Q. and Miao, J.X. and Zhao, Z. and Zhang, S.Y. and Ji, W. and Wu, F.},
  title     = {WINNER: Weakly-supervised hIerarchical decompositioN and aligNment
for spatio-tEmporal video gRounding},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {23090-23099},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidgr3.html#TT237726},
}

@inproceedings{bb242813,
  author    = {Lin, Z.H. and Tan, C.L. and Hu, J.F. and Jin, Z. and Ye, T. and Zheng, W.S.},
  title     = {Collaborative Static and Dynamic Vision-Language Streams for
Spatio-Temporal Video Grounding},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {23100-23109},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidgr3.html#TT237727},
}

@inproceedings{bb242814,
  author    = {Yang, L. and Kong, Q. and Yang, H.K. and Kehl, W. and Sato, Y. and Kobori, N.},
  title     = {DeCo: Decomposition and Reconstruction for Compositional Temporal
Grounding via Coarse-to-Fine Contrastive Ranking},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {23130-23140},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidgr3.html#TT237728},
}

@inproceedings{bb242815,
  author    = {Kim, D. and Park, J. and Lee, J.Y. and Park, S. and Sohn, K.H.},
  title     = {Language-free Training for Zero-shot Video Grounding},
  booktitle = WACV23,
  year      = {2023},
  pages     = {2538-2547},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidgr3.html#TT237729},
}

@inproceedings{bb242816,
  author    = {Dvornik, N. and Hadji, I. and Pham, H. and Bhatt, D. and Martinez, B. and Fazly, A. and Jepson, A.D.},
  title     = {Flow Graph to Video Grounding for Weakly-Supervised Multi-step
Localization},
  booktitle = ECCV22,
  year      = {2022},
  pages     = {XXXV:319-335},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidgr3.html#TT237730},
}

@inproceedings{bb242817,
  author    = {Xiong, Z. and Liu, D. and Zhou, P.},
  title     = {Gaussian Kernel-Based Cross Modal Network for Spatio-Temporal Video
Grounding},
  booktitle = ICIP22,
  year      = {2022},
  pages     = {2481-2485},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidgr3.html#TT237731},
}

@inproceedings{bb242818,
  author    = {Ding, X.P. and Wang, N.N. and Zhang, S.W. and Cheng, D. and Li, X.M. and Huang, Z.Y. and Tang, M.Q. and Gao, X.B.},
  title     = {Support-Set Based Cross-Supervision for Video Grounding},
  booktitle = ICCV21,
  year      = {2021},
  pages     = {11553-11562},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidgr3.html#TT237732},
}

@inproceedings{bb242819,
  author    = {Su, R. and Yu, Q. and Xu, D.},
  title     = {STVGBert: A Visual-linguistic Transformer based Framework for
Spatio-temporal Video Grounding},
  booktitle = ICCV21,
  year      = {2021},
  pages     = {1513-1522},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidgr3.html#TT237733},
}

@inproceedings{bb242820,
  author    = {Soldan, M. and Xu, M.M. and Qu, S. and Tegner, J. and Ghanem, B.},
  title     = {VLG-Net: Video-Language Graph Matching Network for Video Grounding},
  booktitle = CVEU21,
  year      = {2021},
  pages     = {3217-3227},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidgr3.html#TT237734},
}

@inproceedings{bb242821,
  author    = {Nan, G.S. and Qiao, R. and Xiao, Y. and Liu, J. and Leng, S.C. and Zhang, H. and Lu, W.},
  title     = {Interventional Video Grounding with Dual Contrastive Learning},
  booktitle = CVPR21,
  year      = {2021},
  pages     = {2764-2774},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidgr3.html#TT237735},
}

@inproceedings{bb242822,
  author    = {Zhao, Y. and Zhao, Z. and Zhang, Z. and Lin, Z.J.},
  title     = {Cascaded Prediction Network via Segment Tree for Temporal Video
Grounding},
  booktitle = CVPR21,
  year      = {2021},
  pages     = {4195-4204},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidgr3.html#TT237736},
}

@inproceedings{bb242823,
  author    = {Zhang, Z. and Zhao, Z. and Zhao, Y. and Wang, Q. and Liu, H. and Gao, L.},
  title     = {Where Does It Exist: Spatio-Temporal Video Grounding for Multi-Form
Sentences},
  booktitle = CVPR20,
  year      = {2020},
  pages     = {10665-10674},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidgr3.html#TT237737},
}

@inproceedings{bb242824,
  author    = {Zeng, R.H. and Xu, H.M. and Huang, W.B. and Chen, P.H. and Tan, M.K. and Gan, C.},
  title     = {Dense Regression Network for Video Grounding},
  booktitle = CVPR20,
  year      = {2020},
  pages     = {10284-10293},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidgr3.html#TT237738},
}

@inproceedings{bb242825,
  author    = {Shi, J. and Xu, J. and Gong, B.Q. and Xu, C.L.},
  title     = {Not All Frames Are Equal: Weakly-Supervised Video Grounding With
Contextual Similarity and Visual Clustering Losses},
  booktitle = CVPR19,
  year      = {2019},
  pages     = {10436-10444},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidgr3.html#TT237739},
}

@article{bb242826,
  author    = {Wang, Y.C. and Deng, J.J. and Zhou, W.G. and Li, H.Q.},
  title     = {Weakly Supervised Temporal Adjacent Network for Language Grounding},
  journal   = MultMed,
  volume    = {24},
  year      = {2022},
  pages     = {3276-3286},
  bibsource = {http://www.visionbib.com/bibliography/applicat803lagr3.html#TT237740},
}

@article{bb242827,
  author    = {Tang, H.Y. and Zhu, J. and Wang, L. and Zheng, Q.H. and Zhang, T.W.},
  title     = {Multi-Level Query Interaction for Temporal Language Grounding},
  journal   = ITS,
  volume    = {23},
  year      = {2022},
  number    = {12},
  month     = {December},
  pages     = {25479-25488},
  bibsource = {http://www.visionbib.com/bibliography/applicat803lagr3.html#TT237741},
}

@article{bb242828,
  author    = {Zeng, Y.W. and Han, N. and Pan, K.Y. and Jin, Q.},
  title     = {Temporally Language Grounding With Multi-Modal Multi-Prompt Tuning},
  journal   = MultMed,
  volume    = {26},
  year      = {2024},
  pages     = {3366-3377},
  bibsource = {http://www.visionbib.com/bibliography/applicat803lagr3.html#TT237742},
}

@article{bb242829,
  author    = {Zhang, T. and Lu, X.K. and Zhang, H. and Nie, X.S. and Yin, Y.L. and Shen, J.B.},
  title     = {Relational Network via Cascade CRF for Video Language Grounding},
  journal   = MultMed,
  volume    = {26},
  year      = {2024},
  pages     = {8297-8311},
  bibsource = {http://www.visionbib.com/bibliography/applicat803lagr3.html#TT237743},
}

@article{bb242830,
  author    = {Dong, J.X. and Yin, Z.Z.},
  title     = {Annotation-Efficient Hybrid Learning for Temporal Sentence Grounding},
  journal   = CirSysVideo,
  volume    = {36},
  year      = {2026},
  number    = {2},
  month     = {February},
  pages     = {2594-2606},
  bibsource = {http://www.visionbib.com/bibliography/applicat803lagr3.html#TT237744},
}

@inproceedings{bb242831,
  author    = {Shen, S. and Zhu, Z. and Fan, L.Q. and Zhang, H. and Wu, X.X.},
  title     = {DiffCLIP: Leveraging Stable Diffusion for Language Grounded 3D
Classification},
  booktitle = WACV24,
  year      = {2024},
  pages     = {3584-3593},
  bibsource = {http://www.visionbib.com/bibliography/applicat803lagr3.html#TT237745},
}

@inproceedings{bb242832,
  author    = {Dong, P.J. and Yang, X.F. and Wang, Q. and Li, Z.X. and Li, T. and Chu, X.W.},
  title     = {Multi-task Domain Adaptation for Language Grounding with 3d Objects},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {XXXIV: 387-404},
  bibsource = {http://www.visionbib.com/bibliography/applicat803lagr3.html#TT237746},
}

% NOTE(review): the upstream record has no year; 2023 is supplied on the understanding that
% OpenSUN3D was a workshop held at ICCV 2023 -- confirm against the workshop proceedings.
@inproceedings{bb242833,
  author    = {Hegde, D. and Valanarasu, J.M.J. and Patel, V.M.},
  title     = {CLIP goes 3D: Leveraging Prompt Tuning for Language Grounded 3D
Recognition},
  booktitle = OpenSUN3D,
  year      = {2023},
  pages     = {2020-2030},
  bibsource = {http://www.visionbib.com/bibliography/applicat803lagr3.html#TT237747},
}

@inproceedings{bb242834,
  author    = {Jain, A. and Gkanatsios, N. and Mediratta, I. and Fragkiadaki, K.},
  title     = {Bottom Up Top Down Detection Transformers for Language Grounding in
Images and Point Clouds},
  booktitle = ECCV22,
  year      = {2022},
  pages     = {XXXVI:417-433},
  bibsource = {http://www.visionbib.com/bibliography/applicat803lagr3.html#TT237748},
}

@inproceedings{bb242835,
  author    = {Heisler, M. and Banitalebi Dehkordi, A. and Zhang, Y.},
  title     = {SemAug: Semantically Meaningful Image Augmentations for Object
Detection Through Language Grounding},
  booktitle = ECCV22,
  year      = {2022},
  pages     = {XXXVI:610-626},
  bibsource = {http://www.visionbib.com/bibliography/applicat803lagr3.html#TT237749},
}

@inproceedings{bb242836,
  author    = {Soldan, M. and Pardo, A. and Alcazar, J.L. and Heilbron, F.C. and Zhao, C. and Giancola, S. and Ghanem, B.},
  title     = {MAD: A Scalable Dataset for Language Grounding in Videos from Movie
Audio Descriptions},
  booktitle = CVPR22,
  year      = {2022},
  pages     = {5016-5025},
  bibsource = {http://www.visionbib.com/bibliography/applicat803lagr3.html#TT237750},
}

@inproceedings{bb242837,
  author    = {Prabhudesai, M. and Tung, H.Y.F. and Javed, S.A. and Sieb, M. and Harley, A.W. and Fragkiadaki, K.},
  title     = {Embodied Language Grounding With 3D Visual Feature Representations},
  booktitle = CVPR20,
  year      = {2020},
  pages     = {2217-2226},
  bibsource = {http://www.visionbib.com/bibliography/applicat803lagr3.html#TT237751},
}

@inproceedings{bb242838,
  author    = {Bajaj, M. and Wang, L. and Sigal, L.},
  title     = {G3raphGround: Graph-Based Language Grounding},
  booktitle = ICCV19,
  year      = {2019},
  pages     = {4280-4289},
  bibsource = {http://www.visionbib.com/bibliography/applicat803lagr3.html#TT237752},
}

@article{bb242839,
  author    = {Chen, Z.X. and Bie, Y. and Jin, H.B. and Chen, H.},
  title     = {Large Language Model With Region-Guided Referring and Grounding for
CT Report Generation},
  journal   = MedImg,
  volume    = {44},
  year      = {2025},
  number    = {8},
  month     = {August},
  pages     = {3139-3150},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237753},
}

@article{bb242840,
  author    = {Liu, Y. and Hou, H.W. and Ma, F. and Ni, S.G. and Yu, F.R.},
  title     = {MLLM-TA: Leveraging Multimodal Large Language Models for Precise
Temporal Video Grounding},
  journal   = SPLetters,
  volume    = {32},
  year      = {2025},
  pages     = {281-285},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237754},
}

@article{bb242841,
  author    = {Li, G.Z. and Ding, X.P. and Cheng, D. and Li, J. and Wang, N.N. and Gao, X.B.},
  title     = {ETC: Temporal Boundary Expand Then Clarify for Weakly Supervised
Video Grounding With Multimodal Large Language Model},
  journal   = MultMed,
  volume    = {27},
  year      = {2025},
  pages     = {1772-1782},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237755},
}

@article{bb242842,
  author    = {Wu, J.L. and Liu, W. and Liu, Y. and Liu, M. and Nie, L.Q. and Lin, Z.C. and Chen, C.W.},
  title     = {A Survey on Video Temporal Grounding With Multimodal Large Language
Model},
  journal   = PAMI,
  volume    = {48},
  year      = {2026},
  number    = {2},
  month     = {February},
  pages     = {1521-1541},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237756},
}

@article{bb242843,
  author    = {Wang, P. and Liang, Y.X. and Cen, Y.G. and Cen, L.H. and Qu, Z. and Liu, J.L. and Kan, S.C.},
  title     = {Integrating spatial features and dynamically learned temporal
features via contrastive learning for video temporal grounding in LLM},
  journal   = IVC,
  volume    = {167},
  year      = {2026},
  pages     = {105895},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237757},
}

@inproceedings{bb242844,
  author    = {Liu, Y. and Jiang, L. and Li, G.M. and Ye, X.Z. and Ouyang, Y.},
  title     = {YOLO-VG: Enhancing Multi-Stage Feature Interaction for Visual
Grounding},
  booktitle = ICIP25,
  year      = {2025},
  pages     = {469-473},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237758},
}

@inproceedings{bb242845,
  author    = {Gao, J. and Li, Y.Q. and Cao, Z.Q. and Li, W.J.},
  title     = {Interleaved-Modal Chain-of-Thought},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {19520-19529},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237759},
}

@inproceedings{bb242846,
  author    = {Yu, C.L. and Wang, H.Q. and Shi, Y. and Luo, H.Y. and Yang, S. and Yu, J.Y. and Wang, J.Y.},
  title     = {SeqAfford: Sequential 3D Affordance Reasoning via Multimodal Large
Language Model},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {1691-1701},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237760},
}

@inproceedings{bb242847,
  author    = {Huang, Y. and Gao, T.Y. and Xu, H.R. and Zhao, Q.H. and Song, Y. and Gui, Z.P. and Lv, T.C. and Chen, H. and Cui, L. and Li, S. and Wei, F.},
  title     = {PEACE: Empowering Geologic Map Holistic Understanding with MLLMs},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {3899-3908},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237761},
}

@inproceedings{bb242848,
  author    = {Chen, W.B. and Xu, Z. and Xu, R. and Wu, S. and Wong, H.S.},
  title     = {Task-aware Cross-modal Feature Refinement Transformer with Large
Language Models for Visual Grounding},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {3931-3941},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237762},
}

@inproceedings{bb242849,
  author    = {Wu, S. and Jin, S. and Zhang, W.W. and Xu, L. and Liu, W.T. and Li, W. and Loy, C.C.},
  title     = {F-LMM: Grounding Frozen Large Multimodal Models},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {24710-24721},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237763},
}

% The title contains the literal token <SEG>, which the upstream listing dropped (leaving a double space).
% It is written with \textless/\textgreater so it renders under any font encoding, and wrapped in an
% extra brace pair so bibliography styles do not lowercase it.
% NOTE(review): token restored from the published paper title -- confirm against the CVPR 2025 proceedings.
@inproceedings{bb242850,
  author    = {Qian, R. and Yin, X. and Dou, D.},
  title     = {Reasoning to Attend: Try to Understand How {{\textless}SEG{\textgreater}} Token Works},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {24722-24731},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237764},
}

@inproceedings{bb242851,
  author    = {Chen, Y.Y. and Xu, D.X. and Huang, Y. and Zhan, S.K. and Wang, H. and Chen, D.X. and Wang, X.P. and Qiu, M.K. and Li, H.},
  title     = {MIMO: A medical vision language model with visual referring
multimodal input and pixel grounding multimodal output},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {24732-24741},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237765},
}

@inproceedings{bb242852,
  author    = {Huang, H.F. and Chen, X. and Chen, Y.L. and Li, H. and Han, X. and Wang, Z. and Wang, T. and Pang, J.M. and Zhao, Z.},
  title     = {RoboGround: Robotic Manipulation with Grounded Vision-Language Priors},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {22540-22550},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237766},
}

@inproceedings{bb242853,
  author    = {Man, Y.Z. and Huang, D.A. and Liu, G.L. and Sheng, S.W. and Liu, S.L. and Gui, L.Y. and Kautz, J. and Wang, Y.X. and Yu, Z.},
  title     = {Argus: Vision-Centric Reasoning with Grounded Chain-of-Thought},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {14268-14280},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237767},
}

@inproceedings{bb242854,
  author    = {Yin, H. and Ren, Y.Q. and Yan, K. and Ding, S.H. and Hao, Y.T.},
  title     = {ROD-MLLM: Towards More Reliable Object Detection in Multimodal Large
Language Models},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {14358-14368},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237768},
}

@inproceedings{bb242855,
  author    = {Liao, Y.H. and Mahmood, R. and Fidler, S. and Acuna, D.},
  title     = {Can Large Vision-Language Models Correct Semantic Grounding Errors By
Themselves?},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {14667-14678},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237769},
}

@inproceedings{bb242856,
  author    = {Yuan, Z.H. and Peng, Y. and Ren, J. and Liao, Y.H. and Han, Y. and Feng, C.M. and Zhao, H.S. and Li, G.B. and Cui, S.G. and Li, Z.},
  title     = {Empowering Large Language Models with 3D Situation Awareness},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {19435-19445},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237770},
}

@inproceedings{bb242857,
  author    = {Kang, S. and Kim, J. and Kim, J. and Hwang, S.J.},
  title     = {Your Large Vision-Language Model Only Needs A Few Attention Heads For
Visual Grounding},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {9339-9350},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237771},
}

@inproceedings{bb242858,
  author    = {Liu, Q.Y. and Zhang, S.Q. and Qiao, Y.Y. and Zhu, J.Y. and Li, X. and Guo, L.T. and Wang, Q. and He, X.J. and Wu, Q. and Liu, J.},
  title     = {GroundingMate: Aiding Object Grounding for Goal-Oriented
Vision-and-Language Navigation},
  booktitle = WACV25,
  year      = {2025},
  pages     = {1775-1784},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237772},
}

@inproceedings{bb242859,
  author    = {Yan, S. and Bai, M. and Chen, W.F. and Zhou, X. and Huang, Q.X. and Li, L.E.},
  title     = {Vigor: Improving Visual Grounding of Large Vision Language Models with
Fine-grained Reward Modeling},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {LXI: 37-53},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237773},
}

@inproceedings{bb242860,
  author    = {Chowdhury, S. and Nag, S. and Dasgupta, S. and Chen, J. and Elhoseiny, M. and Gao, R.H. and Manocha, D.},
  title     = {Meerkat: Audio-visual Large Language Model for Grounding in Space and
Time},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {LXIV: 52-70},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237774},
}

@inproceedings{bb242861,
  author    = {Kuckreja, K. and Danish, M.S. and Naseer, M. and Das, A. and Khan, S. and Khan, F.S.},
  title     = {GeoChat: Grounded Large Vision-Language Model for Remote Sensing},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {27831-27840},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237775},
}

@inproceedings{bb242862,
  author    = {Song, C.H. and Sadler, B.M. and Wu, J. and Chao, W.L. and Washington, C. and Su, Y.},
  title     = {LLM-Planner: Few-Shot Grounded Planning for Embodied Agents with
Large Language Models},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {2986-2997},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237776},
}

@inproceedings{bb242863,
  author    = {You, K. and Zhang, H.T. and Schoop, E. and Weers, F. and Swearngin, A. and Nichols, J. and Yang, Y.F. and Gan, Z.},
  title     = {FERRET-UI: Grounded Mobile UI Understanding with Multimodal LLMs},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {LXIV: 240-255},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237777},
}

@inproceedings{bb242864,
  author    = {Tong, S.B. and Liu, Z. and Zhai, Y.X. and Ma, Y. and LeCun, Y. and Xie, S.},
  title     = {Eyes Wide Shut? Exploring the Visual Shortcomings of Multimodal LLMs},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {9568-9578},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237778},
}

@inproceedings{bb242865,
  author    = {Xu, J.R. and Zhou, X.Y. and Yan, S. and Gu, X. and Arnab, A. and Sun, C. and Wang, X.L. and Schmid, C.},
  title     = {Pixel Aligned Language Models},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13030-13039},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237779},
}

@inproceedings{bb242866,
  author    = {Wu, P.H. and Xie, S.},
  title     = {V*: Guided Visual Search as a Core Mechanism in Multimodal LLMs},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13084-13094},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237780},
}

@inproceedings{bb242867,
  author    = {He, R. and Cascante Bonilla, P. and Yang, Z.Y. and Berg, A.C. and Ordonez, V.},
  title     = {Improved Visual Grounding through Self-Consistent Explanations},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13095-13105},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237781},
}

@inproceedings{bb242868,
  author    = {Feng, C. and Hsu, J. and Liu, W.Y. and Wu, J.J.},
  title     = {Naturally Supervised 3D Visual Grounding with Language-Regularized
Concept Learners},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13269-13278},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237782},
}

@inproceedings{bb242869,
  author    = {He, J.W. and Wang, Y.F. and Wang, L.J. and Lu, H.C. and He, J.Y. and Lan, J.P. and Luo, B. and Xie, X.},
  title     = {Multi-Modal Instruction Tuned LLMs with Fine-Grained Visual
Perception},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13980-13990},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237783},
}

@inproceedings{bb242870,
  author    = {Yuan, Z.H. and Ren, J. and Feng, C.M. and Zhao, H.S. and Cui, S.G. and Li, Z.},
  title     = {Visual Programming for Zero-Shot Open-Vocabulary 3D Visual Grounding},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {20623-20633},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237784},
}

@inproceedings{bb242871,
  author    = {Chen, G. and Shen, L. and Shao, R. and Deng, X. and Nie, L.Q.},
  title     = {LION: Empowering Multimodal Large Language Model with Dual-Level
Visual Knowledge},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {26530-26540},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237785},
}

@inproceedings{bb242872,
  author    = {Qu, M.X. and Chen, X.D. and Liu, W. and Li, A. and Zhao, Y.},
  title     = {ChatVTG: Video Temporal Grounding via Chat with Video Dialogue Large
Language Models},
  booktitle = PVUW24,
  year      = {2024},
  pages     = {1847-1856},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237786},
}

% Colon after the system name "Groundhog" is supplied here; the upstream listing omits it.
% NOTE(review): confirm the punctuation against the CVPR 2024 proceedings.
@inproceedings{bb242873,
  author    = {Zhang, Y. and Ma, Z.Q. and Gao, X.F. and Shakiah, S. and Gao, Q. and Chai, J.},
  title     = {Groundhog: Grounding Large Language Models to Holistic Segmentation},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {14227-14238},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237787},
}

@inproceedings{bb242874,
  author    = {Kim, K. and Yoon, K. and Jeon, J. and In, Y. and Moon, J. and Kim, D.H. and Park, C.},
  title     = {LLM4SGG: Large Language Models for Weakly Supervised Scene Graph
Generation},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {28306-28316},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT237788},
}

@article{bb242875,
  author    = {Liang, J.W. and Jiang, L. and Cao, L.L. and Kalantidis, Y. and Li, L.J. and Hauptmann, A.G.},
  title     = {Focal Visual-Text Attention for Memex Question Answering},
  journal   = PAMI,
  volume    = {41},
  year      = {2019},
  number    = {8},
  month     = {August},
  pages     = {1893-1908},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT237790},
}

@inproceedings{bb242876,
  author    = {Liang, J.W. and Jiang, L. and Cao, L.L. and Li, L.J. and Hauptmann, A.G.},
  title     = {Focal Visual-Text Attention for Visual Question Answering},
  booktitle = CVPR18,
  year      = {2018},
  pages     = {6135-6143},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT237791},
}

@article{bb242877,
  author    = {Riquelme, F. and de Goyeneche, A. and Zhang, Y.D. and Niebles, J.C. and Soto, A.},
  title     = {Explaining VQA predictions using visual grounding and a knowledge
base},
  journal   = IVC,
  volume    = {101},
  year      = {2020},
  pages     = {103968},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT237792},
}

@article{bb242878,
  author    = {Plummer, B.A. and Shih, K.J. and Li, Y.C. and Xu, K. and Lazebnik, S. and Sclaroff, S. and Saenko, K.},
  title     = {Revisiting Image-Language Networks for Open-Ended Phrase Detection},
  journal   = PAMI,
  volume    = {44},
  year      = {2022},
  number    = {4},
  month     = {April},
  pages     = {2155-2167},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT237793},
}

@inproceedings{bb242879,
  author    = {Burns, A. and Tan, R. and Saenko, K. and Sclaroff, S. and Plummer, B.A.},
  title     = {Language Features Matter: Effective Language Representations for
Vision-Language Tasks},
  booktitle = ICCV19,
  year      = {2019},
  pages     = {7473-7482},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT237794},
}

@inproceedings{bb242880,
  author    = {Arbelle, A. and Doveh, S. and Alfassy, A. and Shtok, J. and Lev, G. and Schwartz, E. and Kuehne, H. and Levi, H.B. and Sattigeri, P. and Panda, R. and Chen, C.F. and Bronstein, A.M. and Saenko, K. and Ullman, S. and Giryes, R. and Feris, R.S. and Karlinsky, L.},
  title     = {Detector-Free Weakly Supervised Grounding by Separation},
  booktitle = ICCV21,
  year      = {2021},
  pages     = {1781-1792},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT237795}
}

@inproceedings{bb242881,
  author    = {Whitehead, S. and Wu, H. and Ji, H. and Feris, R.S. and Saenko, K.},
  title     = {Separating Skills and Concepts for Novel Visual Question Answering},
  booktitle = CVPR21,
  year      = {2021},
  pages     = {5628-5637},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT237796}
}

@article{bb242882,
  author    = {Zhao, L.C. and Cai, D.G. and Zhang, J. and Sheng, L. and Xu, D. and Zheng, R. and Zhao, Y.J. and Wang, L.P. and Fan, X.},
  title     = {Toward Explainable 3D Grounded Visual Question Answering: A New Benchmark and Strong Baseline},
  journal   = CirSysVideo,
  volume    = {33},
  year      = {2023},
  number    = {6},
  month     = {June},
  pages     = {2935-2949},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT237797}
}

@article{bb242883,
  author    = {Zhu, L.J. and Peng, L. and Zhou, W.N. and Yang, J.L.},
  title     = {Dual-decoder transformer network for answer grounding in visual question answering},
  journal   = PRL,
  volume    = {171},
  year      = {2023},
  pages     = {53-60},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT237798}
}

@article{bb242884,
  author    = {Li, Y.C. and Wang, X. and Xiao, J.B. and Ji, W. and Chua, T.S.},
  title     = {Transformer-Empowered Invariant Grounding for Video Question Answering},
  journal   = PAMI,
  volume    = {47},
  year      = {2025},
  number    = {11},
  month     = {November},
  pages     = {9510-9522},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT237799}
}

@inproceedings{bb242885,
  author    = {Li, Y.C. and Wang, X. and Xiao, J.B. and Ji, W. and Chua, T.S.},
  title     = {Invariant Grounding for Video Question Answering},
  booktitle = CVPR22,
  year      = {2022},
  pages     = {2918-2927},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT237800}
}

@inproceedings{bb242886,
  author    = {Huang, J.Y. and Jia, B.X. and Wang, Y. and Zhu, Z.Y. and Linghu, X.K. and Li, Q. and Zhu, S.C. and Huang, S.Y.},
  title     = {Unveiling the Mist over 3D Vision-Language Understanding: Object-centric Evaluation with Chain-of-Analysis},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {24570-24581},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT237801}
}

@inproceedings{bb242887,
  author    = {Chen, K. and Wu, X.Q.},
  title     = {VTQA: Visual Text Question Answering via Entity Alignment and Cross-Media Reasoning},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {27208-27217},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT237802}
}

@inproceedings{bb242888,
  author    = {Di, S.Z. and Xie, W.},
  title     = {Grounded Question-Answering in Long Egocentric Videos},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {12934-12943},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT237803}
}

@inproceedings{bb242889,
  author    = {Chen, C.Y. and Anjum, S. and Gurari, D.},
  title     = {VQA Therapy: Exploring Answer Differences by Visually Grounding Answers},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {15269-15279},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT237804}
}

@inproceedings{bb242890,
  author    = {Le, T.M. and Le, V. and Gupta, S.I. and Venkatesh, S. and Tran, T.},
  title     = {Guiding Visual Question Answering with Attention Priors},
  booktitle = WACV23,
  year      = {2023},
  pages     = {4370-4379},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT237805}
}

@inproceedings{bb242891,
  author    = {Khan, A.U. and Kuehne, H. and Gan, C. and da Vitoria Lobo, N. and Shah, M.},
  title     = {Weakly Supervised Grounding for VQA in Vision-Language Transformers},
  booktitle = ECCV22,
  year      = {2022},
  pages     = {XXXV:652-670},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT237806}
}

% NOTE(review): booktitle below is a quoted literal, whereas neighbouring
% entries reference venues through bare macro names. Confirm whether an
% ICPR22 macro is defined before unquoting it.
@inproceedings{bb242892,
  author    = {Gupta, K. and Gautam, D. and Mamidi, R.},
  title     = {cViL: Cross-Lingual Training of Vision-Language Models using Knowledge Distillation},
  booktitle = {ICPR22},
  year      = {2022},
  pages     = {1734-1741},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT237807}
}

@inproceedings{bb242893,
  author    = {Lu, X.P. and Fan, Z. and Wang, Y. and Oh, J. and Rose, C.P.},
  title     = {Localize, Group, and Select: Boosting Text-VQA by Scene Text Modeling},
  booktitle = XSAnim21,
  year      = {2021},
  pages     = {2631-2639},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT237808}
}

% NOTE(review): "Lobo, N." here may be the same person as
% "da Vitoria Lobo, N." in entry bb242891 -- confirm before unifying.
@inproceedings{bb242894,
  author    = {Khan, A.U. and Kuehne, H. and Duarte, K. and Gan, C. and Lobo, N. and Shah, M.},
  title     = {Found a Reason for me? Weakly-supervised Grounded Visual Question Answering using Capsules},
  booktitle = CVPR21,
  year      = {2021},
  pages     = {8461-8470},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT237809}
}

% Author-name fix: the fifth author is Marco Tulio Ribeiro, whose family name
% is "Ribeiro". Written as "Tulio Ribeiro, M." BibTeX takes everything before
% the comma as the surname, so the entry sorted and labelled under "Tulio
% Ribeiro". "Tulio" is now carried as a second given-name initial.
@inproceedings{bb242895,
        AUTHOR = "Selvaraju, R.R. and Tendulkar, P. and Parikh, D. and Horvitz, E. and Ribeiro, M.T. and Nushi, B. and Kamar, E.",
        TITLE = "SQuINTing at VQA Models: Introspecting VQA Models With Sub-Questions",
        BOOKTITLE = CVPR20,
        YEAR = "2020",
        PAGES = "10000-10008",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT237810"}

@inproceedings{bb242896,
  author    = {Gouthaman, K.V. and Mittal, A.},
  title     = {Reducing Language Biases in Visual Question Answering with Visually-grounded Question Encoder},
  booktitle = ECCV20,
  year      = {2020},
  pages     = {XIII:18-34},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT237811}
}

@inproceedings{bb242897,
  author    = {Tan, H.L. and Leong, M.C. and Xu, Q. and Li, L. and Fang, F. and Cheng, Y. and Gauthier, N. and Sun, Y. and Lim, J.H.},
  title     = {Task-Oriented Multi-Modal Question Answering For Collaborative Applications},
  booktitle = ICIP20,
  year      = {2020},
  pages     = {1426-1430},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT237812}
}

@inproceedings{bb242898,
  author    = {Selvaraju, R.R. and Lee, S. and Shen, Y. and Jin, H. and Ghosh, S. and Heck, L. and Batra, D. and Parikh, D.},
  title     = {Taking a HINT: Leveraging Explanations to Make Vision and Language Models More Grounded},
  booktitle = ICCV19,
  year      = {2019},
  pages     = {2591-2600},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT237813}
}

@inproceedings{bb242899,
  author    = {Zhang, Y. and Niebles, J.C. and Soto, A.},
  title     = {Interpretable Visual Question Answering by Visual Grounding From Attention Supervision Mining},
  booktitle = WACV19,
  year      = {2019},
  pages     = {349-357},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT237814}
}

Last update: Apr 23, 2026 at 15:05:02