% Bibliography entries bb235700 - bb235799 (visual grounding, LLM-based grounding,
% grounded VQA, referring expressions).
%
% Conventions used by every entry in this section:
%   * Bare BOOKTITLE / JOURNAL values (ICCV21, CVPR25, PAMI, MultMed, ...) are
%     string macros. They are not defined in this section and must be defined
%     before these entries are read, otherwise BibTeX reports an undefined
%     macro and prints nothing for the venue.
%   * PAGES of the form "XVIII:353-370" carry a roman-numeral prefix before the
%     page range (presumably the proceedings volume; confirm with the generator).
%   * BIBSOURCE is a non-standard field (ignored by standard styles) holding
%     the URL and anchor the record was taken from.
%
% NOTE: comments live only outside entries and must never contain an at-sign.

% ---- BIBSOURCE page: applicat803vgr2.html (bb235700 - bb235738) ----

@inproceedings{bb235700,
  AUTHOR = "Gonzalez, C. and Ayobi, N. and Hernandez, I. and Hernandez, J. and Pont Tuset, J. and Arbelaez, P.",
  TITLE = "Panoptic Narrative Grounding",
  BOOKTITLE = ICCV21,
  YEAR = "2021",
  PAGES = "1344-1353",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230674"
}

@inproceedings{bb235701,
  AUTHOR = "Hong, Y. and Li, Q. and Zhu, S.C. and Huang, S.Y.",
  TITLE = "VLGrammar: Grounded Grammar Induction of Vision and Language",
  BOOKTITLE = ICCV21,
  YEAR = "2021",
  PAGES = "1645-1654",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230675"
}

@inproceedings{bb235702,
  AUTHOR = "Yuan, Z.H. and Yan, X. and Liao, Y.H. and Zhang, R.M. and Wang, S. and Li, Z. and Cui, S.G.",
  TITLE = "InstanceRefer: Cooperative Holistic Understanding for Visual Grounding on Point Clouds through Instance Multi-level Contextual Referring",
  BOOKTITLE = ICCV21,
  YEAR = "2021",
  PAGES = "1771-1780",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230676"
}

@inproceedings{bb235703,
  AUTHOR = "Soldan, M. and Xu, M.M. and Qu, S. and Tegner, J. and Ghanem, B.",
  TITLE = "VLG-Net: Video-Language Graph Matching Network for Video Grounding",
  BOOKTITLE = CVEU21,
  YEAR = "2021",
  PAGES = "3217-3227",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230677"
}

@inproceedings{bb235704,
  AUTHOR = "Tian, Y.P. and Hu, D. and Xu, C.L.",
  TITLE = "Cyclic Co-Learning of Sounding Object Visual Grounding and Sound Separation",
  BOOKTITLE = CVPR21,
  YEAR = "2021",
  PAGES = "2744-2753",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230678"
}

@inproceedings{bb235705,
  AUTHOR = "Nan, G.S. and Qiao, R. and Xiao, Y. and Liu, J. and Leng, S.C. and Zhang, H. and Lu, W.",
  TITLE = "Interventional Video Grounding with Dual Contrastive Learning",
  BOOKTITLE = CVPR21,
  YEAR = "2021",
  PAGES = "2764-2774",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230679"
}

@inproceedings{bb235706,
  AUTHOR = "Zhao, Y. and Zhao, Z. and Zhang, Z. and Lin, Z.J.",
  TITLE = "Cascaded Prediction Network via Segment Tree for Temporal Video Grounding",
  BOOKTITLE = CVPR21,
  YEAR = "2021",
  PAGES = "4195-4204",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230680"
}

@inproceedings{bb235707,
  AUTHOR = "Liu, Y.F. and Wan, B. and Ma, L. and He, X.M.",
  TITLE = "Relation-aware Instance Refinement for Weakly Supervised Visual Grounding",
  BOOKTITLE = CVPR21,
  YEAR = "2021",
  PAGES = "5608-5617",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230681"
}

@inproceedings{bb235708,
  AUTHOR = "Liu, H.L. and Lin, A. and Han, X.G. and Yang, L. and Yu, Y.Z. and Cui, S.G.",
  TITLE = "Refer-it-in-RGBD: A Bottom-up Approach for 3D Visual Grounding in RGBD Images",
  BOOKTITLE = CVPR21,
  YEAR = "2021",
  PAGES = "6028-6037",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230682"
}

@inproceedings{bb235709,
  AUTHOR = "Lin, X.R. and Li, G.B. and Yu, Y.Z.",
  TITLE = "Scene-Intuitive Agent for Remote Embodied Visual Grounding",
  BOOKTITLE = CVPR21,
  YEAR = "2021",
  PAGES = "7032-7041",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230683"
}

@inproceedings{bb235710,
  AUTHOR = "Liu, D.Z. and Qu, X.Y. and Dong, J.F. and Zhou, P. and Cheng, Y. and Wei, W. and Xu, Z. and Xie, Y.",
  TITLE = "Context-aware Biaffine Localizing Network for Temporal Sentence Grounding",
  BOOKTITLE = CVPR21,
  YEAR = "2021",
  PAGES = "11230-11239",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230684"
}

@inproceedings{bb235711,
  AUTHOR = "Meng, Z.H. and Yu, L.C. and Zhang, N. and Berg, T. and Damavandi, B. and Singh, V. and Bearman, A.",
  TITLE = "Connecting What to Say With Where to Look by Modeling Human Attention Traces",
  BOOKTITLE = CVPR21,
  YEAR = "2021",
  PAGES = "12674-12683",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230685"
}

@inproceedings{bb235712,
  AUTHOR = "Wang, L.W. and Huang, J. and Li, Y. and Xu, K. and Yang, Z.Y. and Yu, D.",
  TITLE = "Improving Weakly Supervised Visual Grounding by Contrastive Knowledge Distillation",
  BOOKTITLE = CVPR21,
  YEAR = "2021",
  PAGES = "14085-14095",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230686"
}

@inproceedings{bb235713,
  AUTHOR = "Huang, B.B. and Lian, D.Z. and Luo, W.X. and Gao, S.H.",
  TITLE = "Look Before You Leap: Learning Landmark Features for One-Stage Visual Grounding",
  BOOKTITLE = CVPR21,
  YEAR = "2021",
  PAGES = "16883-16892",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230687"
}

@inproceedings{bb235714,
  AUTHOR = "Zhou, H. and Zhang, C.Y. and Luo, Y. and Chen, Y.J. and Hu, C.P.",
  TITLE = "Embracing Uncertainty: Decoupling and De-bias for Robust Temporal Grounding",
  BOOKTITLE = CVPR21,
  YEAR = "2021",
  PAGES = "8441-8450",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230688"
}

@inproceedings{bb235715,
  AUTHOR = "Zhang, S.Y. and Jiang, T. and Wang, T. and Kuang, K. and Zhao, Z. and Zhu, J. and Yu, J. and Yang, H.X. and Wu, F.",
  TITLE = "DeVLBert: Out-of-distribution Visio-Linguistic Pretraining with Causality",
  BOOKTITLE = CiV21,
  YEAR = "2021",
  PAGES = "1744-1747",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230689"
}

@inproceedings{bb235716,
  AUTHOR = "Nguyen, A.T. and Richards, L.E. and Kebe, G.Y. and Raff, E. and Darvish, K. and Ferraro, F. and Matuszek, C.",
  TITLE = "Practical Cross-modal Manifold Alignment for Robotic Grounded Language Learning",
  BOOKTITLE = MULA21,
  YEAR = "2021",
  PAGES = "1613-1622",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230690"
}

@inproceedings{bb235717,
  AUTHOR = "Shrestha, A. and Pugdeethosapol, K. and Fang, H.W. and Qiu, Q.R.",
  TITLE = "MAGNet: Multi-Region Attention-Assisted Grounding of Natural Language Queries at Phrase Level",
  BOOKTITLE = ICPR21,
  YEAR = "2021",
  PAGES = "8275-8282",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230691"
}

@inproceedings{bb235718,
  AUTHOR = "Zhang, Z. and Zhao, Z. and Zhao, Y. and Wang, Q. and Liu, H. and Gao, L.",
  TITLE = "Where Does It Exist: Spatio-Temporal Video Grounding for Multi-Form Sentences",
  BOOKTITLE = CVPR20,
  YEAR = "2020",
  PAGES = "10665-10674",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230692"
}

@inproceedings{bb235719,
  AUTHOR = "Sadhu, A. and Chen, K. and Nevatia, R.",
  TITLE = "Video Object Grounding Using Semantic Roles in Language Description",
  BOOKTITLE = CVPR20,
  YEAR = "2020",
  PAGES = "10414-10424",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230693"
}

@inproceedings{bb235720,
  AUTHOR = "Ma, C.Y. and Kalantidis, Y. and AlRegib, G. and Vajda, P. and Rohrbach, M. and Kira, Z.",
  TITLE = "Learning to Generate Grounded Visual Captions Without Localization Supervision",
  BOOKTITLE = ECCV20,
  YEAR = "2020",
  PAGES = "XVIII:353-370",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230694"
}

@inproceedings{bb235721,
  AUTHOR = "Zeng, R.H. and Xu, H.M. and Huang, W.B. and Chen, P.H. and Tan, M.K. and Gan, C.",
  TITLE = "Dense Regression Network for Video Grounding",
  BOOKTITLE = CVPR20,
  YEAR = "2020",
  PAGES = "10284-10293",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230695"
}

@inproceedings{bb235722,
  AUTHOR = "Gupta, T. and Vahdat, A. and Chechik, G. and Yang, X.D. and Kautz, J. and Hoiem, D.",
  TITLE = "Contrastive Learning for Weakly Supervised Phrase Grounding",
  BOOKTITLE = ECCV20,
  YEAR = "2020",
  PAGES = "III:752-768",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230696"
}

@inproceedings{bb235723,
  AUTHOR = "Yang, S. and Li, G.B. and Yu, Y.Z.",
  TITLE = "Propagating Over Phrase Relations for One-stage Visual Grounding",
  BOOKTITLE = ECCV20,
  YEAR = "2020",
  PAGES = "XIX:589-605",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230697"
}

@inproceedings{bb235724,
  AUTHOR = "Xiao, J.B. and Shang, X. and Yang, X. and Tang, S. and Chua, T.S.",
  TITLE = "Visual Relation Grounding in Videos",
  BOOKTITLE = ECCV20,
  YEAR = "2020",
  PAGES = "VI:447-464",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230698"
}

@inproceedings{bb235725,
  AUTHOR = "Mun, J. and Cho, M. and Han, B.",
  TITLE = "Local-Global Video-Text Interactions for Temporal Grounding",
  BOOKTITLE = CVPR20,
  YEAR = "2020",
  PAGES = "10807-10816",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230699"
}

@inproceedings{bb235726,
  AUTHOR = "Wu, C. and Lin, Z. and Cohen, S. and Bui, T. and Maji, S.",
  TITLE = "PhraseCut: Language-Based Image Segmentation in the Wild",
  BOOKTITLE = CVPR20,
  YEAR = "2020",
  PAGES = "10213-10222",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230700"
}

@inproceedings{bb235727,
  AUTHOR = "Chen, L. and Zhai, M.Y. and He, J.W. and Mori, G.",
  TITLE = "Object Grounding via Iterative Context Reasoning",
  BOOKTITLE = MDALC19,
  YEAR = "2019",
  PAGES = "1407-1415",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230701"
}

@inproceedings{bb235728,
  AUTHOR = "Sinha, A. and Akilesh, B. and Sarkar, M. and Krishnamurthy, B.",
  TITLE = "Attention Based Natural Language Grounding by Navigating Virtual Environment",
  BOOKTITLE = WACV19,
  YEAR = "2019",
  PAGES = "236-244",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230702"
}

@inproceedings{bb235729,
  AUTHOR = "Shi, J. and Xu, J. and Gong, B.Q. and Xu, C.L.",
  TITLE = "Not All Frames Are Equal: Weakly-Supervised Video Grounding With Contextual Similarity and Visual Clustering Losses",
  BOOKTITLE = CVPR19,
  YEAR = "2019",
  PAGES = "10436-10444",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230703"
}

@inproceedings{bb235730,
  AUTHOR = "Datta, S. and Sikka, K. and Roy, A. and Ahuja, K. and Parikh, D. and Divakaran, A.",
  TITLE = "Align2Ground: Weakly Supervised Phrase Grounding Guided by Image-Caption Alignment",
  BOOKTITLE = ICCV19,
  YEAR = "2019",
  PAGES = "2601-2610",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230704"
}

@inproceedings{bb235731,
  AUTHOR = "Fang, Z.Y. and Kong, S. and Fowlkes, C.C. and Yang, Y.Z.",
  TITLE = "Modularized Textual Grounding for Counterfactual Resilience",
  BOOKTITLE = CVPR19,
  YEAR = "2019",
  PAGES = "6371-6381",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230705"
}

@inproceedings{bb235732,
  AUTHOR = "Zhuang, B. and Wu, Q. and Shen, C. and Reid, I.D. and van den Hengel, A.J.",
  TITLE = "Parallel Attention: A Unified Framework for Visual Object Discovery Through Dialogs and Queries",
  BOOKTITLE = CVPR18,
  YEAR = "2018",
  PAGES = "4252-4261",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230706"
}

@inproceedings{bb235733,
  AUTHOR = "Yang, Z.Y. and Chen, T.L. and Wang, L.W. and Luo, J.B.",
  TITLE = "Improving One-Stage Visual Grounding by Recursive Sub-query Construction",
  BOOKTITLE = ECCV20,
  YEAR = "2020",
  PAGES = "XIV:387-404",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230707"
}

@inproceedings{bb235734,
  AUTHOR = "Liu, D.Q. and Zhang, H.W. and Zha, Z.J. and Wu, F.",
  TITLE = "Learning to Assemble Neural Module Tree Networks for Visual Grounding",
  BOOKTITLE = ICCV19,
  YEAR = "2019",
  PAGES = "4672-4681",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230708"
}

@inproceedings{bb235735,
  AUTHOR = "Sadhu, A. and Chen, K. and Nevatia, R.",
  TITLE = "Zero-Shot Grounding of Objects From Natural Language Queries",
  BOOKTITLE = ICCV19,
  YEAR = "2019",
  PAGES = "4693-4702",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230709"
}

@inproceedings{bb235736,
  AUTHOR = "Yang, Z.Y. and Gong, B.Q. and Wang, L.W. and Huang, W.B. and Yu, D. and Luo, J.B.",
  TITLE = "A Fast and Accurate One-Stage Approach to Visual Grounding",
  BOOKTITLE = ICCV19,
  YEAR = "2019",
  PAGES = "4682-4692",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230710"
}

@inproceedings{bb235737,
  AUTHOR = "Rohrbach, A. and Rohrbach, M. and Tang, S. and Oh, S.J. and Schiele, B.",
  TITLE = "Generating Descriptions with Grounded and Co-referenced People",
  BOOKTITLE = CVPR17,
  YEAR = "2017",
  PAGES = "4196-4206",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230711"
}

@inproceedings{bb235738,
  AUTHOR = "Zhu, Y. and Kiros, R. and Zemel, R. and Salakhutdinov, R. and Urtasun, R. and Torralba, A.B. and Fidler, S.",
  TITLE = "Aligning Books and Movies: Towards Story-Like Visual Explanations by Watching Movies and Reading Books",
  BOOKTITLE = ICCV15,
  YEAR = "2015",
  PAGES = "19-27",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT230712"
}

% ---- BIBSOURCE page: applicat803llmgr4.html (bb235739 - bb235772) ----

@article{bb235739,
  AUTHOR = "Chen, Z.X. and Bie, Y. and Jin, H.B. and Chen, H.",
  TITLE = "Large Language Model With Region-Guided Referring and Grounding for CT Report Generation",
  JOURNAL = MedImg,
  VOLUME = "44",
  YEAR = "2025",
  NUMBER = "8",
  MONTH = "August",
  PAGES = "3139-3150",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230713"
}

@article{bb235740,
  AUTHOR = "Liu, Y. and Hou, H.W. and Ma, F. and Ni, S.G. and Yu, F.R.",
  TITLE = "MLLM-TA: Leveraging Multimodal Large Language Models for Precise Temporal Video Grounding",
  JOURNAL = SPLetters,
  VOLUME = "32",
  YEAR = "2025",
  PAGES = "281-285",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230714"
}

@article{bb235741,
  AUTHOR = "Li, G.Z. and Ding, X.P. and Cheng, D. and Li, J. and Wang, N.N. and Gao, X.B.",
  TITLE = "ETC: Temporal Boundary Expand Then Clarify for Weakly Supervised Video Grounding With Multimodal Large Language Model",
  JOURNAL = MultMed,
  VOLUME = "27",
  YEAR = "2025",
  PAGES = "1772-1782",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230715"
}

@inproceedings{bb235742,
  AUTHOR = "Gao, J. and Li, Y.Q. and Cao, Z.Q. and Li, W.J.",
  TITLE = "Interleaved-Modal Chain-of-Thought",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "19520-19529",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230716"
}

@inproceedings{bb235743,
  AUTHOR = "Yu, C.L. and Wang, H.Q. and Shi, Y. and Luo, H.Y. and Yang, S. and Yu, J.Y. and Wang, J.Y.",
  TITLE = "SeqAfford: Sequential 3D Affordance Reasoning via Multimodal Large Language Model",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "1691-1701",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230717"
}

@inproceedings{bb235744,
  AUTHOR = "Huang, Y. and Gao, T.Y. and Xu, H.R. and Zhao, Q.H. and Song, Y. and Gui, Z.P. and Lv, T.C. and Chen, H. and Cui, L. and Li, S. and Wei, F.",
  TITLE = "PEACE: Empowering Geologic Map Holistic Understanding with MLLMs",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "3899-3908",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230718"
}

@inproceedings{bb235745,
  AUTHOR = "Chen, W.B. and Xu, Z. and Xu, R. and Wu, S. and Wong, H.S.",
  TITLE = "Task-aware Cross-modal Feature Refinement Transformer with Large Language Models for Visual Grounding",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "3931-3941",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230719"
}

@inproceedings{bb235746,
  AUTHOR = "Wu, S. and Jin, S. and Zhang, W.W. and Xu, L. and Liu, W.T. and Li, W. and Loy, C.C.",
  TITLE = "F-LMM: Grounding Frozen Large Multimodal Models",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "24710-24721",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230720"
}

% NOTE(review): the angle-bracketed SEG token in the next title was restored;
% the record previously read "HowToken" - confirm against the published title.
@inproceedings{bb235747,
  AUTHOR = "Qian, R. and Yin, X. and Dou, D.",
  TITLE = "Reasoning to Attend: Try to Understand How {{\textless}SEG{\textgreater}} Token Works",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "24722-24731",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230721"
}

@inproceedings{bb235748,
  AUTHOR = "Chen, Y. and Xu, D. and Huang, Y. and Zhan, S.K. and Wang, H. and Chen, D.X. and Wang, X.P. and Qiu, M. and Li, H.",
  TITLE = "MIMO: A medical vision language model with visual referring multimodal input and pixel grounding multimodal output",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "24732-24741",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230722"
}

@inproceedings{bb235749,
  AUTHOR = "Huang, H.F. and Chen, X. and Chen, Y.L. and Li, H. and Han, X. and Wang, Z. and Wang, T. and Pang, J.M. and Zhao, Z.",
  TITLE = "RoboGround: Robotic Manipulation with Grounded Vision-Language Priors",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "22540-22550",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230723"
}

@inproceedings{bb235750,
  AUTHOR = "Man, Y.Z. and Huang, D.A. and Liu, G.L. and Sheng, S.W. and Liu, S.L. and Gui, L.Y. and Kautz, J. and Wang, Y.X. and Yu, Z.",
  TITLE = "Argus: Vision-Centric Reasoning with Grounded Chain-of-Thought",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "14268-14280",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230724"
}

@inproceedings{bb235751,
  AUTHOR = "Yin, H. and Ren, Y.Q. and Yan, K. and Ding, S.H. and Hao, Y.T.",
  TITLE = "ROD-MLLM: Towards More Reliable Object Detection in Multimodal Large Language Models",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "14358-14368",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230725"
}

@inproceedings{bb235752,
  AUTHOR = "Liao, Y.H. and Mahmood, R. and Fidler, S. and Acuna, D.",
  TITLE = "Can Large Vision-Language Models Correct Semantic Grounding Errors By Themselves?",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "14667-14678",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230726"
}

@inproceedings{bb235753,
  AUTHOR = "Yuan, Z.H. and Peng, Y. and Ren, J. and Liao, Y.H. and Han, Y. and Feng, C.M. and Zhao, H.S. and Li, G.B. and Cui, S.G. and Li, Z.",
  TITLE = "Empowering Large Language Models with 3D Situation Awareness",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "19435-19445",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230727"
}

@inproceedings{bb235754,
  AUTHOR = "Kang, S. and Kim, J. and Kim, J. and Hwang, S.J.",
  TITLE = "Your Large Vision-Language Model Only Needs A Few Attention Heads For Visual Grounding",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "9339-9350",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230728"
}

@inproceedings{bb235755,
  AUTHOR = "Liu, Q.Y. and Zhang, S.Q. and Qiao, Y. and Zhu, J. and Li, X. and Guo, L. and Wang, Q. and He, X.J. and Wu, Q. and Liu, J.",
  TITLE = "GroundingMate: Aiding Object Grounding for Goal-Oriented Vision-and-Language Navigation",
  BOOKTITLE = WACV25,
  YEAR = "2025",
  PAGES = "1775-1784",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230729"
}

@inproceedings{bb235756,
  AUTHOR = "Yan, S. and Bai, M. and Chen, W.F. and Zhou, X. and Huang, Q.X. and Li, L.E.",
  TITLE = "Vigor: Improving Visual Grounding of Large Vision Language Models with Fine-grained Reward Modeling",
  BOOKTITLE = ECCV24,
  YEAR = "2024",
  PAGES = "LXI: 37-53",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230730"
}

@inproceedings{bb235757,
  AUTHOR = "Chowdhury, S. and Nag, S. and Dasgupta, S. and Chen, J. and Elhoseiny, M. and Gao, R.H. and Manocha, D.",
  TITLE = "Meerkat: Audio-visual Large Language Model for Grounding in Space and Time",
  BOOKTITLE = ECCV24,
  YEAR = "2024",
  PAGES = "LXIV: 52-70",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230731"
}

@inproceedings{bb235758,
  AUTHOR = "Kuckreja, K. and Danish, M.S. and Naseer, M. and Das, A. and Khan, S. and Khan, F.S.",
  TITLE = "GeoChat: Grounded Large Vision-Language Model for Remote Sensing",
  BOOKTITLE = CVPR24,
  YEAR = "2024",
  PAGES = "27831-27840",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230732"
}

@inproceedings{bb235759,
  AUTHOR = "Song, C.H. and Sadler, B.M. and Wu, J. and Chao, W.L. and Washington, C. and Su, Y.",
  TITLE = "LLM-Planner: Few-Shot Grounded Planning for Embodied Agents with Large Language Models",
  BOOKTITLE = ICCV23,
  YEAR = "2023",
  PAGES = "2986-2997",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230733"
}

@inproceedings{bb235760,
  AUTHOR = "You, K. and Zhang, H.T. and Schoop, E. and Weers, F. and Swearngin, A. and Nichols, J. and Yang, Y.F. and Gan, Z.",
  TITLE = "FERRET-UI: Grounded Mobile UI Understanding with Multimodal LLMs",
  BOOKTITLE = ECCV24,
  YEAR = "2024",
  PAGES = "LXIV: 240-255",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230734"
}

@inproceedings{bb235761,
  AUTHOR = "Tong, S.B. and Liu, Z. and Zhai, Y.X. and Ma, Y. and LeCun, Y. and Xie, S.",
  TITLE = "Eyes Wide Shut? Exploring the Visual Shortcomings of Multimodal LLMs",
  BOOKTITLE = CVPR24,
  YEAR = "2024",
  PAGES = "9568-9578",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230735"
}

@inproceedings{bb235762,
  AUTHOR = "Xu, J.R. and Zhou, X.Y. and Yan, S. and Gu, X. and Arnab, A. and Sun, C. and Wang, X.L. and Schmid, C.",
  TITLE = "Pixel Aligned Language Models",
  BOOKTITLE = CVPR24,
  YEAR = "2024",
  PAGES = "13030-13039",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230736"
}

@inproceedings{bb235763,
  AUTHOR = "Wu, P.H. and Xie, S.",
  TITLE = "V*: Guided Visual Search as a Core Mechanism in Multimodal LLMs",
  BOOKTITLE = CVPR24,
  YEAR = "2024",
  PAGES = "13084-13094",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230737"
}

@inproceedings{bb235764,
  AUTHOR = "He, R. and Cascante Bonilla, P. and Yang, Z.Y. and Berg, A.C. and Ordonez, V.",
  TITLE = "Improved Visual Grounding through Self-Consistent Explanations",
  BOOKTITLE = CVPR24,
  YEAR = "2024",
  PAGES = "13095-13105",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230738"
}

@inproceedings{bb235765,
  AUTHOR = "Feng, C. and Hsu, J. and Liu, W.Y. and Wu, J.J.",
  TITLE = "Naturally Supervised 3D Visual Grounding with Language-Regularized Concept Learners",
  BOOKTITLE = CVPR24,
  YEAR = "2024",
  PAGES = "13269-13278",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230739"
}

@inproceedings{bb235766,
  AUTHOR = "He, J.W. and Wang, Y.F. and Wang, L.J. and Lu, H.C. and He, J.Y. and Lan, J.P. and Luo, B. and Xie, X.",
  TITLE = "Multi-Modal Instruction Tuned LLMs with Fine-Grained Visual Perception",
  BOOKTITLE = CVPR24,
  YEAR = "2024",
  PAGES = "13980-13990",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230740"
}

@inproceedings{bb235767,
  AUTHOR = "Huang, B. and Wang, X. and Chen, H. and Song, Z. and Zhu, W.W.",
  TITLE = "VTimeLLM: Empower LLM to Grasp Video Moments",
  BOOKTITLE = CVPR24,
  YEAR = "2024",
  PAGES = "14271-14280",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230741"
}

@inproceedings{bb235768,
  AUTHOR = "Yuan, Z.H. and Ren, J. and Feng, C.M. and Zhao, H.S. and Cui, S.G. and Li, Z.",
  TITLE = "Visual Programming for Zero-Shot Open-Vocabulary 3D Visual Grounding",
  BOOKTITLE = CVPR24,
  YEAR = "2024",
  PAGES = "20623-20633",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230742"
}

@inproceedings{bb235769,
  AUTHOR = "Chen, G. and Shen, L. and Shao, R. and Deng, X. and Nie, L.Q.",
  TITLE = "LION: Empowering Multimodal Large Language Model with Dual-Level Visual Knowledge",
  BOOKTITLE = CVPR24,
  YEAR = "2024",
  PAGES = "26530-26540",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230743"
}

@inproceedings{bb235770,
  AUTHOR = "Qu, M.X. and Chen, X.D. and Liu, W. and Li, A. and Zhao, Y.",
  TITLE = "ChatVTG: Video Temporal Grounding via Chat with Video Dialogue Large Language Models",
  BOOKTITLE = PVUW24,
  YEAR = "2024",
  PAGES = "1847-1856",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230744"
}

% NOTE(review): the colon after the system name in the next title was restored;
% the record previously read "Groundhog Grounding ..." - confirm against the
% published title.
@inproceedings{bb235771,
  AUTHOR = "Zhang, Y. and Ma, Z.Q. and Gao, X.F. and Shakiah, S. and Gao, Q. and Chai, J.",
  TITLE = "Groundhog: Grounding Large Language Models to Holistic Segmentation",
  BOOKTITLE = CVPR24,
  YEAR = "2024",
  PAGES = "14227-14238",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230745"
}

@inproceedings{bb235772,
  AUTHOR = "Kim, K. and Yoon, K. and Jeon, J. and In, Y. and Moon, J. and Kim, D.H. and Park, C.",
  TITLE = "LLM4SGG: Large Language Models for Weakly Supervised Scene Graph Generation",
  BOOKTITLE = CVPR24,
  YEAR = "2024",
  PAGES = "28306-28316",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT230746"
}

% ---- BIBSOURCE page: applicat803vgrqa3.html (bb235773 - bb235792) ----

@article{bb235773,
  AUTHOR = "Liang, J.W. and Jiang, L. and Cao, L.L. and Kalantidis, Y. and Li, L.J. and Hauptmann, A.G.",
  TITLE = "Focal Visual-Text Attention for Memex Question Answering",
  JOURNAL = PAMI,
  VOLUME = "41",
  YEAR = "2019",
  NUMBER = "8",
  MONTH = "August",
  PAGES = "1893-1908",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT230748"
}

@inproceedings{bb235774,
  AUTHOR = "Liang, J.W. and Jiang, L. and Cao, L.L. and Li, L.J. and Hauptmann, A.G.",
  TITLE = "Focal Visual-Text Attention for Visual Question Answering",
  BOOKTITLE = CVPR18,
  YEAR = "2018",
  PAGES = "6135-6143",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT230749"
}

@article{bb235775,
  AUTHOR = "Riquelme, F. and de Goyeneche, A. and Zhang, Y.D. and Niebles, J.C. and Soto, A.",
  TITLE = "Explaining VQA predictions using visual grounding and a knowledge base",
  JOURNAL = IVC,
  VOLUME = "101",
  YEAR = "2020",
  PAGES = "103968",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT230750"
}

@article{bb235776,
  AUTHOR = "Zhao, L.C. and Cai, D.G. and Zhang, J. and Sheng, L. and Xu, D. and Zheng, R. and Zhao, Y.J. and Wang, L.P. and Fan, X.",
  TITLE = "Toward Explainable 3D Grounded Visual Question Answering: A New Benchmark and Strong Baseline",
  JOURNAL = CirSysVideo,
  VOLUME = "33",
  YEAR = "2023",
  NUMBER = "6",
  MONTH = "June",
  PAGES = "2935-2949",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT230751"
}

@article{bb235777,
  AUTHOR = "Zhu, L.J. and Peng, L. and Zhou, W.N. and Yang, J.L.",
  TITLE = "Dual-decoder transformer network for answer grounding in visual question answering",
  JOURNAL = PRL,
  VOLUME = "171",
  YEAR = "2023",
  PAGES = "53-60",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT230752"
}

@inproceedings{bb235778,
  AUTHOR = "Huang, J.Y. and Jia, B.X. and Wang, Y. and Zhu, Z.Y. and Linghu, X.K. and Li, Q. and Zhu, S.C. and Huang, S.Y.",
  TITLE = "Unveiling the Mist over 3D Vision-Language Understanding: Object-centric Evaluation with Chain-of-Analysis",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "24570-24581",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT230753"
}

@inproceedings{bb235779,
  AUTHOR = "Chen, K. and Wu, X.Q.",
  TITLE = "VTQA: Visual Text Question Answering via Entity Alignment and Cross-Media Reasoning",
  BOOKTITLE = CVPR24,
  YEAR = "2024",
  PAGES = "27208-27217",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT230754"
}

@inproceedings{bb235780,
  AUTHOR = "Di, S.Z. and Xie, W.",
  TITLE = "Grounded Question-Answering in Long Egocentric Videos",
  BOOKTITLE = CVPR24,
  YEAR = "2024",
  PAGES = "12934-12943",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT230755"
}

@inproceedings{bb235781,
  AUTHOR = "Chen, C.Y. and Anjum, S. and Gurari, D.",
  TITLE = "VQA Therapy: Exploring Answer Differences by Visually Grounding Answers",
  BOOKTITLE = ICCV23,
  YEAR = "2023",
  PAGES = "15269-15279",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT230756"
}

@inproceedings{bb235782,
  AUTHOR = "Le, T.M. and Le, V. and Gupta, S.I. and Venkatesh, S. and Tran, T.",
  TITLE = "Guiding Visual Question Answering with Attention Priors",
  BOOKTITLE = WACV23,
  YEAR = "2023",
  PAGES = "4370-4379",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT230757"
}

@inproceedings{bb235783,
  AUTHOR = "Khan, A.U. and Kuehne, H. and Gan, C. and da Vitoria Lobo, N. and Shah, M.",
  TITLE = "Weakly Supervised Grounding for VQA in Vision-Language Transformers",
  BOOKTITLE = ECCV22,
  YEAR = "2022",
  PAGES = "XXXV:652-670",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT230758"
}

% NOTE(review): BOOKTITLE in the next entry is the quoted literal "ICPR22",
% whereas the other entries use bare macro names (e.g. ICPR21). Left unchanged;
% confirm whether a macro was intended.
@inproceedings{bb235784,
  AUTHOR = "Gupta, K. and Gautam, D. and Mamidi, R.",
  TITLE = "cViL: Cross-Lingual Training of Vision-Language Models using Knowledge Distillation",
  BOOKTITLE = "ICPR22",
  YEAR = "2022",
  PAGES = "1734-1741",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT230759"
}

@inproceedings{bb235785,
  AUTHOR = "Li, Y.C. and Wang, X. and Xiao, J.B. and Ji, W. and Chua, T.S.",
  TITLE = "Invariant Grounding for Video Question Answering",
  BOOKTITLE = CVPR22,
  YEAR = "2022",
  PAGES = "2918-2927",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT230760"
}

@inproceedings{bb235786,
  AUTHOR = "Lu, X.P. and Fan, Z. and Wang, Y. and Oh, J. and Rose, C.P.",
  TITLE = "Localize, Group, and Select: Boosting Text-VQA by Scene Text Modeling",
  BOOKTITLE = XSAnim21,
  YEAR = "2021",
  PAGES = "2631-2639",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT230761"
}

@inproceedings{bb235787,
  AUTHOR = "Khan, A.U. and Kuehne, H. and Duarte, K. and Gan, C. and Lobo, N. and Shah, M.",
  TITLE = "Found a Reason for me? Weakly-supervised Grounded Visual Question Answering using Capsules",
  BOOKTITLE = CVPR21,
  YEAR = "2021",
  PAGES = "8461-8470",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT230762"
}

@inproceedings{bb235788,
  AUTHOR = "Selvaraju, R.R. and Tendulkar, P. and Parikh, D. and Horvitz, E. and Tulio Ribeiro, M. and Nushi, B. and Kamar, E.",
  TITLE = "SQuINTing at VQA Models: Introspecting VQA Models With Sub-Questions",
  BOOKTITLE = CVPR20,
  YEAR = "2020",
  PAGES = "10000-10008",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT230763"
}

@inproceedings{bb235789,
  AUTHOR = "Gouthaman, K.V. and Mittal, A.",
  TITLE = "Reducing Language Biases in Visual Question Answering with Visually-grounded Question Encoder",
  BOOKTITLE = ECCV20,
  YEAR = "2020",
  PAGES = "XIII:18-34",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT230764"
}

@inproceedings{bb235790,
  AUTHOR = "Tan, H.L. and Leong, M.C. and Xu, Q. and Li, L. and Fang, F. and Cheng, Y. and Gauthier, N. and Sun, Y. and Lim, J.H.",
  TITLE = "Task-Oriented Multi-Modal Question Answering For Collaborative Applications",
  BOOKTITLE = ICIP20,
  YEAR = "2020",
  PAGES = "1426-1430",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT230765"
}

@inproceedings{bb235791,
  AUTHOR = "Selvaraju, R.R. and Lee, S. and Shen, Y. and Jin, H. and Ghosh, S. and Heck, L. and Batra, D. and Parikh, D.",
  TITLE = "Taking a HINT: Leveraging Explanations to Make Vision and Language Models More Grounded",
  BOOKTITLE = ICCV19,
  YEAR = "2019",
  PAGES = "2591-2600",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT230766"
}

@inproceedings{bb235792,
  AUTHOR = "Zhang, Y. and Niebles, J.C. and Soto, A.",
  TITLE = "Interpretable Visual Question Answering by Visual Grounding From Attention Supervision Mining",
  BOOKTITLE = WACV19,
  YEAR = "2019",
  PAGES = "349-357",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT230767"
}

% ---- BIBSOURCE page: applicat803refex3.html (bb235793 - bb235799) ----

@article{bb235793,
  AUTHOR = "Li, X. and Jiang, S.",
  TITLE = "Bundled Object Context for Referring Expressions",
  JOURNAL = MultMed,
  VOLUME = "20",
  YEAR = "2018",
  NUMBER = "10",
  MONTH = "October",
  PAGES = "2749-2760",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803refex3.html#TT230768"
}

@article{bb235794,
  AUTHOR = "Wang, J.M. and Cui, E. and Liu, K.L. and Sun, Y.K. and Liang, J.Y. and Yuan, C.M. and Duan, X.J. and Jin, G.H. and Chung, T.S.",
  TITLE = "Referring expression comprehension model with matching detection and linguistic feedback",
  JOURNAL = IET-CV,
  VOLUME = "14",
  YEAR = "2020",
  NUMBER = "8",
  MONTH = "December",
  PAGES = "625-633",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803refex3.html#TT230769"
}

@article{bb235795,
  AUTHOR = "Qiao, Y.Y. and Deng, C.R. and Wu, Q.",
  TITLE = "Referring Expression Comprehension: A Survey of Methods and Datasets",
  JOURNAL = MultMed,
  VOLUME = "23",
  YEAR = "2021",
  PAGES = "4426-4440",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803refex3.html#TT230770"
}

@article{bb235796,
  AUTHOR = "Niu, Y.L. and Zhang, H.W. and Lu, Z.W. and Chang, S.F.",
  TITLE = "Variational Context: Exploiting Visual and Textual Context for Grounding Referring Expressions",
  JOURNAL = PAMI,
  VOLUME = "43",
  YEAR = "2021",
  NUMBER = "1",
  MONTH = "January",
  PAGES = "347-359",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803refex3.html#TT230771"
}

@article{bb235797,
  AUTHOR = "Yang, S. and Li, G.B. and Yu, Y.Z.",
  TITLE = "Relationship-Embedded Representation Learning for Grounding Referring Expressions",
  JOURNAL = PAMI,
  VOLUME = "43",
  YEAR = "2021",
  NUMBER = "8",
  MONTH = "August",
  PAGES = "2765-2779",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803refex3.html#TT230772"
}

@inproceedings{bb235798,
  AUTHOR = "Yang, S. and Li, G.B. and Yu, Y.Z.",
  TITLE = "Cross-Modal Relationship Inference for Grounding Referring Expressions",
  BOOKTITLE = CVPR19,
  YEAR = "2019",
  PAGES = "4140-4149",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803refex3.html#TT230773"
}

@article{bb235799,
  AUTHOR = "Sun, M.J. and Xiao, J. and Lim, E.G. and Liu, S. and Goulermas, J.Y.",
  TITLE = "Discriminative Triad Matching and Reconstruction for Weakly Referring Expression Grounding",
  JOURNAL = PAMI,
  VOLUME = "43",
  YEAR = "2021",
  NUMBER = "11",
  MONTH = "November",
  PAGES = "4189-4195",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803refex3.html#TT230774"
}