@article{bb228500,
        AUTHOR = "Xu, Z. and Wei, K. and Yang, X. and Deng, C.",
        TITLE = "Point-Supervised Video Temporal Grounding",
        JOURNAL = MultMed,
        VOLUME = "25",
        YEAR = "2023",
        PAGES = "6121--6131",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223499"}

@article{bb228501,
        AUTHOR = "Luo, F. and Chen, S. X. and Chen, J. J. and Wu, Z. X. and Jiang, Y. G.",
        TITLE = "Self-Supervised Learning for Semi-Supervised Temporal Language
Grounding",
        JOURNAL = MultMed,
        VOLUME = "25",
        YEAR = "2023",
        PAGES = "7747--7757",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223500"}

@article{bb228502,
        AUTHOR = "Liu, D. Z. and Fang, X. and Hu, W. and Zhou, P.",
        TITLE = "Exploring Optical-Flow-Guided Motion and Detection-Based Appearance
for Temporal Sentence Grounding",
        JOURNAL = MultMed,
        VOLUME = "25",
        YEAR = "2023",
        PAGES = "8539--8553",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223501"}

@article{bb228503,
        AUTHOR = "Yang, X. F. and Liu, F. and Lin, G. S.",
        TITLE = "Effective End-to-End Vision Language Pretraining With Semantic Visual
Loss",
        JOURNAL = MultMed,
        VOLUME = "25",
        YEAR = "2023",
        PAGES = "8408--8417",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223502"}

@article{bb228504,
        AUTHOR = "Ma, G. Q. and Bai, Y. and Zhang, W. and Yao, T. and Shihada, B. and Mei, T.",
        TITLE = "Boosting Generic Visual-Linguistic Representation With Dynamic
Contexts",
        JOURNAL = MultMed,
        VOLUME = "25",
        YEAR = "2023",
        PAGES = "8445--8457",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223503"}

@article{bb228505,
        AUTHOR = "Su, C. and Li, Z. and Lei, T. Y. and Peng, D. Z. and Wang, X.",
        TITLE = "{MetaVG}: A Meta-Learning Framework for Visual Grounding",
        JOURNAL = SPLetters,
        VOLUME = "31",
        YEAR = "2024",
        PAGES = "236--240",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223504"}

@article{bb228506,
        AUTHOR = "Zeng, Y. W. and Han, N. and Pan, K. Y. and Jin, Q.",
        TITLE = "Temporally Language Grounding With Multi-Modal Multi-Prompt Tuning",
        JOURNAL = MultMed,
        VOLUME = "26",
        YEAR = "2024",
        PAGES = "3366--3377",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223505"}

@article{bb228507,
        AUTHOR = "Fang, X. and Liu, D. and Zhou, P. and Xu, Z. and Li, R. X.",
        TITLE = "Hierarchical Local-Global Transformer for Temporal Sentence Grounding",
        JOURNAL = MultMed,
        VOLUME = "26",
        YEAR = "2024",
        PAGES = "3263--3277",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223506"}

@article{bb228508,
        AUTHOR = "Wang, Z. Y. and Yang, C. and Jiang, B. and Yuan, J. S.",
        TITLE = "A Dual Reinforcement Learning Framework for Weakly Supervised Phrase
Grounding",
        JOURNAL = MultMed,
        VOLUME = "26",
        YEAR = "2024",
        PAGES = "394--405",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223507"}

@article{bb228509,
        AUTHOR = "Lu, Y. and Quan, R. J. and Zhu, L. C. and Yang, Y.",
        TITLE = "Zero-Shot Video Grounding With Pseudo Query Lookup and Verification",
        JOURNAL = IP,
        VOLUME = "33",
        YEAR = "2024",
        PAGES = "1643--1654",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223508"}

@article{bb228510,
        AUTHOR = "Wang, W. K. and Su, Y. T. and Liu, J. and Jing, P. G.",
        TITLE = "Adaptive proposal network based on generative adversarial learning
for weakly supervised temporal sentence grounding",
        JOURNAL = PRL,
        VOLUME = "179",
        YEAR = "2024",
        PAGES = "9--16",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223509"}

@article{bb228511,
        AUTHOR = "Liu, M. and Zhou, D. and Guo, J. and Luo, X. and Gao, Z. and Nie, L. Q.",
        TITLE = "Semantic-Aware Contrastive Learning With Proposal Suppression for
Video Semantic Role Grounding",
        JOURNAL = CirSysVideo,
        VOLUME = "34",
        YEAR = "2024",
        NUMBER = "4",
        MONTH = apr,
        PAGES = "3003--3016",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223510"}

@article{bb228512,
        AUTHOR = "Tang, W. and Li, L. and Liu, X. J. and Jin, L. and Tang, J. H. and Li, Z. C.",
        TITLE = "Context Disentangling and Prototype Inheriting for Robust Visual
Grounding",
        JOURNAL = PAMI,
        VOLUME = "46",
        YEAR = "2024",
        NUMBER = "5",
        MONTH = may,
        PAGES = "3213--3229",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223511"}

@article{bb228513,
        AUTHOR = "Shi, F. Y. and Huang, W. L. and Wang, L. M.",
        TITLE = "End-to-end dense video grounding via parallel regression",
        JOURNAL = CVIU,
        VOLUME = "242",
        YEAR = "2024",
        PAGES = "103980",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223512"}

@article{bb228514,
        AUTHOR = "Shao, R. and Wu, T. X. and Wu, J. L. and Nie, L. Q. and Liu, Z. W.",
        TITLE = "Detecting and Grounding Multi-Modal Media Manipulation and Beyond",
        JOURNAL = PAMI,
        VOLUME = "46",
        YEAR = "2024",
        NUMBER = "8",
        MONTH = aug,
        PAGES = "5556--5574",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223513"}

@inproceedings{bb228515,
        AUTHOR = "Shao, R. and Wu, T. X. and Liu, Z. W.",
        TITLE = "Detecting and Grounding Multi-Modal Media Manipulation",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "6904--6913",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223514"}

@article{bb228516,
        AUTHOR = "Chen, L. and Deng, Z. and Liu, L. and Yin, S.",
        TITLE = "Multilevel Semantic Interaction Alignment for Video-Text Cross-Modal
Retrieval",
        JOURNAL = CirSysVideo,
        VOLUME = "34",
        YEAR = "2024",
        NUMBER = "7",
        MONTH = jul,
        PAGES = "6559--6575",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223515"}

@article{bb228517,
        AUTHOR = "Zhang, T. and Lu, X. K. and Zhang, H. and Nie, X. S. and Yin, Y. L. and Shen, J. B.",
        TITLE = "Relational Network via Cascade {CRF} for Video Language Grounding",
        JOURNAL = MultMed,
        VOLUME = "26",
        YEAR = "2024",
        PAGES = "8297--8311",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223516"}

@article{bb228518,
        AUTHOR = "Wu, Q. Q. and Guo, L. J. and Zhang, R. and Qian, J. B. and Gao, S.",
        TITLE = "{QSMT-net}: A query-sensitive proposal and multi-temporal-span matching
network for video grounding",
        JOURNAL = IVC,
        VOLUME = "149",
        YEAR = "2024",
        PAGES = "105188",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223517"}

@article{bb228519,
        AUTHOR = "Yao, H. B. and Wang, L. P. and Cai, C. T. and Wang, W. and Zhang, Z. and Shang, X. B.",
        TITLE = "Language conditioned multi-scale visual attention networks for visual
grounding",
        JOURNAL = IVC,
        VOLUME = "150",
        YEAR = "2024",
        PAGES = "105242",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223518"}

@article{bb228520,
        AUTHOR = "Wu, W. and Cao, M. and Hu, Y. and Peng, Y. and Qin, L. and Yin, Q.",
        TITLE = "Visual Grounding With Dual Knowledge Distillation",
        JOURNAL = CirSysVideo,
        VOLUME = "34",
        YEAR = "2024",
        NUMBER = "10",
        MONTH = oct,
        PAGES = "10399--10410",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223519"}

@article{bb228521,
        AUTHOR = "Li, S. T. and Li, B. and Sun, B. and Weng, Y. X.",
        TITLE = "Towards Visual-Prompt Temporal Answer Grounding in Instructional
Video",
        JOURNAL = PAMI,
        VOLUME = "46",
        YEAR = "2024",
        NUMBER = "12",
        MONTH = dec,
        PAGES = "8836--8853",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223520"}

@inproceedings{bb228522,
        AUTHOR = "Fang, X. and Xiong, Z. and Fang, W. L. and Qu, X. Y. and Chen, C. and Dong, J. F. and Tang, K. and Zhou, P. and Cheng, Y. and Liu, D. Z.",
        TITLE = "Rethinking Weakly-supervised Video Temporal Grounding From a Game
Perspective",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XLV: 290--311",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223521"}

@article{bb228523,
        AUTHOR = "Xiong, Z. and Liu, D. Z. and Fang, X. and Qu, X. Y. and Dong, J. F. and Zhu, J. H. and Tang, K. and Zhou, P.",
        TITLE = "Rethinking Video Sentence Grounding from a Tracking Perspective With
Memory Network and Masked Attention",
        JOURNAL = MultMed,
        VOLUME = "26",
        YEAR = "2024",
        PAGES = "11204--11218",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223522"}

@article{bb228524,
        AUTHOR = "Qi, Z. B. and Yuan, Y. and Ruan, X. W. and Wang, S. H. and Zhang, W. G. and Huang, Q. M.",
        TITLE = "Collaborative Debias Strategy for Temporal Sentence Grounding in
Video",
        JOURNAL = CirSysVideo,
        VOLUME = "34",
        YEAR = "2024",
        NUMBER = "11",
        MONTH = nov,
        PAGES = "10972--10986",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223523"}

@article{bb228525,
        AUTHOR = "Zhou, S. and Zhang, F. and Wang, R. M. and Zhou, F. and Su, Z.",
        TITLE = "Subtask Prior-Driven Optimized Mechanism on Joint Video Moment
Retrieval and Highlight Detection",
        JOURNAL = CirSysVideo,
        VOLUME = "34",
        YEAR = "2024",
        NUMBER = "11",
        MONTH = nov,
        PAGES = "11271--11285",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223524"}

@article{bb228526,
        AUTHOR = "Ji, Z. and Wu, J. and Wang, Y. and Yang, A. and Han, J. G.",
        TITLE = "Progressive Semantic Reconstruction Network for Weakly Supervised
Referring Expression Grounding",
        JOURNAL = CirSysVideo,
        VOLUME = "34",
        YEAR = "2024",
        NUMBER = "12",
        MONTH = dec,
        PAGES = "13058--13070",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223525"}

@article{bb228527,
        AUTHOR = "Dong, J. X. and Yin, Z. Z.",
        TITLE = "Graph-based Dense Event Grounding with relative positional encoding",
        JOURNAL = CVIU,
        VOLUME = "251",
        YEAR = "2025",
        PAGES = "104257",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223526"}

@article{bb228528,
        AUTHOR = "Tang, K. F. and He, L. H. and Wang, N. N. and Gao, X. B.",
        TITLE = "Dual Semantic Reconstruction Network for Weakly Supervised Temporal
Sentence Grounding",
        JOURNAL = MultMed,
        VOLUME = "27",
        YEAR = "2025",
        PAGES = "95--107",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223527"}

@article{bb228529,
        AUTHOR = "Wang, C. L. and Feng, W. Q. and Lyu, S. C. and Cheng, G. L. and Li, X. T. and Liu, B. H. and Zhao, Q.",
        TITLE = "A Masked Reference Token Supervision-Based Iterative Visual-Language
Framework for Robust Visual Grounding",
        JOURNAL = CirSysVideo,
        VOLUME = "35",
        YEAR = "2025",
        NUMBER = "1",
        MONTH = jan,
        PAGES = "75--90",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223528"}

@inproceedings{bb228530,
        AUTHOR = "Li, M. H. and Wang, C. L. and Feng, W. Q. and Lyu, S. C. and Cheng, G. L. and Li, X. T. and Liu, B. and Zhao, Q.",
        TITLE = "Iterative Robust Visual Grounding with Masked Reference based
Centerpoint Supervision",
        BOOKTITLE = VLAR23,
        YEAR = "2023",
        PAGES = "4653--4658",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223529"}

@article{bb228531,
        AUTHOR = "Liu, H. and Tan, Z. C. and Chen, Q. and Wei, Y. C. and Zhao, Y. and Wang, J. D.",
        TITLE = "Unified Frequency-Assisted Transformer Framework for Detecting and
Grounding Multi-modal Manipulation",
        JOURNAL = IJCV,
        VOLUME = "133",
        YEAR = "2025",
        NUMBER = "3",
        MONTH = mar,
        PAGES = "1392--1409",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223530"}

@article{bb228532,
        AUTHOR = "Ding, Y. and Wang, D. and Li, K. and Zhao, X. H. and Wang, Y. F.",
        TITLE = "Visual grounding of remote sensing images with multi-dimensional
semantic-guidance",
        JOURNAL = PRL,
        VOLUME = "189",
        YEAR = "2025",
        PAGES = "85--91",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223531"}

@article{bb228533,
        AUTHOR = "Li, T. Y. and Wang, C. and Tian, S. and Zhang, B. and Wu, F. and Tang, Y. X. and Zhang, H.",
        TITLE = "{TACMT}: Text-aware cross-modal transformer for visual grounding on
high-resolution {SAR} images",
        JOURNAL = PandRS,
        VOLUME = "222",
        YEAR = "2025",
        PAGES = "152--166",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223532"}

@inproceedings{bb228534,
        AUTHOR = "Singhi, N. and Kim, J. M. and Roth, K. and Akata, Z.",
        TITLE = "Improving Intervention Efficacy via Concept Realignment in Concept
Bottleneck Models",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XXVI: 422--438",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223533"}

@inproceedings{bb228535,
        AUTHOR = "Li, X. and Qiu, K. and Wang, J. L. and Xu, X. H. and Singh, R. and Yamazaki, K. and Chen, H. and Huang, X. N. and Raj, B.",
        TITLE = "{$R^2$-Bench}: Benchmarking the Robustness of Referring Perception Models
Under Perturbations",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "IX: 211--230",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223534"}

@inproceedings{bb228536,
        AUTHOR = "Lee, P. and Byun, H. R.",
        TITLE = "{BAM-DETR}: Boundary-aligned Moment Detection Transformer for Temporal
Sentence Grounding in Videos",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "II: 220--238",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223535"}

@inproceedings{bb228537,
        AUTHOR = "Ma, C. and Jiang, Y. and Wu, J. N. and Yuan, Z. H. and Qi, X. J.",
        TITLE = "{GROMA}: Localized Visual Tokenization for Grounding Multimodal Large
Language Models",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "VI: 417--435",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223536"}

@inproceedings{bb228538,
        AUTHOR = "Huang, Z. and Satoh, S.",
        TITLE = "{LOA-TRANS}: Enhancing Visual Grounding by Location-aware Transformers",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "VII: 405--421",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223537"}

@inproceedings{bb228539,
        AUTHOR = "Zhu, C. and Wang, T. and Zhang, W. W. and Chen, K. and Liu, X. H.",
        TITLE = "{SCANREASON}: Empowering 3d Visual Grounding with Reasoning Capabilities",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "VIII: 151--168",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223538"}

@inproceedings{bb228540,
        AUTHOR = "Xiao, Z. and Gong, M. and Cascante Bonilla, P. and Zhang, X. Y. and Wu, J. and Ordonez, V.",
        TITLE = "Grounding Language Models for Visual Entity Recognition",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XI: 393--411",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223539"}

@inproceedings{bb228541,
        AUTHOR = "Cheng, Z. X. and Pu, Y. J. and Gong, S. G. and Kordjamshidi, P. and Kong, Y.",
        TITLE = "Shine: Saliency-aware Hierarchical Negative Ranking for Compositional
Temporal Grounding",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XIX: 398--416",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223540"}

@inproceedings{bb228542,
        AUTHOR = "Lee, P. Y. and Sung, M.",
        TITLE = "Reground: Improving Textual and Spatial Grounding at No Cost",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XXIII: 275--292",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223541"}

@inproceedings{bb228543,
        AUTHOR = "Jiang, H. B. and Lu, Z. Q.",
        TITLE = "Visual Grounding for Object-level Generalization in Reinforcement
Learning",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XXX: 55--72",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223542"}

@inproceedings{bb228544,
        AUTHOR = "Sun, P. L. and Song, Y. X. and Pan, X. and Kang, W. T. and Liu, G. and Shah, M. and Yan, Y.",
        TITLE = "{SEGVG}: Transferring Object Bounding Box to Segmentation for Visual
Grounding",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XXXVIII: 57--75",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223543"}

@inproceedings{bb228545,
        AUTHOR = "Kang, D. and Cho, M.",
        TITLE = "In Defense of Lazy Visual Grounding for Open-vocabulary Semantic
Segmentation",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XLI: 143--164",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223544"}

% NOTE(review): the published ECCV 2024 title appears to read "R^2-Tuning"; confirm the exponent against the proceedings before citing.
@inproceedings{bb228546,
        AUTHOR = "Liu, Y. and He, J. and Li, W. and Kim, J. and Wei, D. L. and Pfister, H. and Chen, C. W.",
        TITLE = "{$R^1$-tuning}: Efficient Image-to-video Transfer Learning for Video
Temporal Grounding",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XLI: 421--438",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223545"}

@inproceedings{bb228547,
        AUTHOR = "Zhang, H. and Li, H. Y. and Li, F. and Ren, T. and Zou, X. and Liu, S. and Huang, S. J. and Gao, J. F. and Zhang, L. and Li, C. Y. and Yang, J. W.",
        TITLE = "{LLAVA-Grounding}: Grounded Visual Chat with Large Multimodal Models",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XLIII: 19--35",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223546"}

@inproceedings{bb228548,
        AUTHOR = "Yang, J. and Ding, R. and Brown, E. and Qi, X. J. and Xie, S.",
        TITLE = "{V-IRL}: Grounding Virtual Intelligence in Real Life",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XLV: 36--55",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223547"}

@inproceedings{bb228549,
        AUTHOR = "Chen, W. and Chen, L. and Wu, Y.",
        TITLE = "An Efficient and Effective Transformer Decoder-based Framework for
Multi-task Visual Grounding",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XLV: 125--141",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223548"}

@inproceedings{bb228550,
        AUTHOR = "Qian, Z. P. and Ma, Y. W. and Lin, Z. K. and Ji, J. Y. and Zheng, X. and Sun, X. S. and Ji, R. R.",
        TITLE = "Multi-branch Collaborative Learning Network for 3d Visual Grounding",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XLVI: 381--398",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223549"}

@inproceedings{bb228551,
        AUTHOR = "Liu, S. and Zeng, Z. Y. and Ren, T. and Li, F. and Zhang, H. and Yang, J. and Jiang, Q. and Li, C. Y. and Yang, J. W. and Su, H. and Zhu, J. and Zhang, L.",
        TITLE = "Grounding {Dino}: Marrying {Dino} with Grounded Pre-training for Open-set
Object Detection",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XLVII: 38--55",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223550"}

@inproceedings{bb228552,
        AUTHOR = "Jin, Y. and Mu, Y. D.",
        TITLE = "Weakly-supervised Spatio-temporal Video Grounding with Variational
Cross-modal Alignment",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XLVIII: 412--429",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223551"}

@inproceedings{bb228553,
        AUTHOR = "Fujiwara, K. and Tanaka, M. and Yu, Q.",
        TITLE = "Chronologically Accurate Retrieval for Temporal Grounding of
Motion-language Models",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "LVIII: 323--339",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223552"}

@inproceedings{bb228554,
        AUTHOR = "Yan, S. and Bai, M. and Chen, W. F. and Zhou, X. and Huang, Q. X. and Li, L. E.",
        TITLE = "Vigor: Improving Visual Grounding of Large Vision Language Models with
Fine-grained Reward Modeling",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "LXI: 37--53",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223553"}

@inproceedings{bb228555,
        AUTHOR = "Chowdhury, S. and Nag, S. and Dasgupta, S. and Chen, J. and Elhoseiny, M. and Gao, R. H. and Manocha, D.",
        TITLE = "Meerkat: Audio-visual Large Language Model for Grounding in Space and
Time",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "LXIV: 52--70",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223554"}

@inproceedings{bb228556,
        AUTHOR = "Leroy, V. and Cabon, Y. and Revaud, J.",
        TITLE = "Grounding Image Matching in 3d with {Mast3r}",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "LXXII: 71--91",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223555"}

@inproceedings{bb228557,
        AUTHOR = "Unal, O. and Sakaridis, C. and Saha, S. and Van Gool, L. J.",
        TITLE = "Four Ways to Improve Verbo-visual Fusion for Dense 3d Visual Grounding",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "LXXVI: 196--213",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223556"}

@inproceedings{bb228558,
        AUTHOR = "Wan, D. and Cho, J. and Stengel Eskin, E. and Bansal, M.",
        TITLE = "Contrastive Region Guidance: Improving Grounding in Vision-language
Models Without Training",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "LXXIX: 198--215",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223557"}

@inproceedings{bb228559,
        AUTHOR = "Zheng, M. H. and Cai, X. H. and Chen, Q. C. and Peng, Y. X. and Liu, Y.",
        TITLE = "Training-Free Video Temporal Grounding Using Large-Scale Pre-Trained
Models",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "LXXXII: 20--37",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223558"}

@inproceedings{bb228560,
        AUTHOR = "Bao, P. J. and Shao, Z. and Yang, W. H. and Ng, B. P. and Kot, A. C.",
        TITLE = "E3m: Zero-shot Spatio-temporal Video Grounding with
Expectation-maximization Multimodal Modulation",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "LXXXIII: 227--243",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223559"}

@inproceedings{bb228561,
        AUTHOR = "Dong, P. J. and Yang, X. F. and Wang, Q. and Li, Z. X. and Li, T. and Chu, X. W.",
        TITLE = "Multi-task Domain Adaptation for Language Grounding with 3d Objects",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XXXIV: 387--404",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223560"}

@inproceedings{bb228562,
        AUTHOR = "Hannan, T. and Islam, M. M. and Seidl, T. and Bertasius, G.",
        TITLE = "{RGNET}: A Unified Clip Retrieval and Grounding Network for Long Videos",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XXI: 352--369",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223561"}

@inproceedings{bb228563,
        AUTHOR = "Khoshsirat, S. and Kambhamettu, C.",
        TITLE = "Embedding Attention Blocks for Answer Grounding",
        BOOKTITLE = ICIP24,
        YEAR = "2024",
        PAGES = "521--527",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223562"}

@inproceedings{bb228564,
        AUTHOR = "Hamilton, M. and Zisserman, A. and Hershey, J. R. and Freeman, W. T.",
        TITLE = "Separating the `Chirp' from the `Chat':
Self-supervised Visual Grounding of Sound and Language",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "13117--13127",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223563"}

@inproceedings{bb228565,
        AUTHOR = "Shen, Y. H. and Fu, C. Y. and Chen, P. X. and Zhang, M. and Li, K. and Sun, X. and Wu, Y. S. and Lin, S. H. and Ji, R. R.",
        TITLE = "Aligning and Prompting Everything All at Once for Universal Visual
Perception",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "13193--13203",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223564"}

@inproceedings{bb228566,
        AUTHOR = "Wu, T. H. and Biamby, G. and Chan, D. and Dunlap, L. and Gupta, R. and Wang, X. D. and Gonzalez, J. E. and Darrell, T. J.",
        TITLE = "See, Say, and Segment: Teaching {LMMs} to Overcome False Premises",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "13459--13469",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223565"}

@inproceedings{bb228567,
        AUTHOR = "Wang, Y. and Li, Y. and Wang, S. J.",
        TITLE = "{G3-LQ}: Marrying Hyperbolic Alignment with Explicit Semantic-Geometric
Modeling for {3D} Visual Grounding",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "13917--13926",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223566"}

@inproceedings{bb228568,
        AUTHOR = "Rizve, M. N. and Fei, F. and Unnikrishnan, J. and Tran, S. and Yao, B. Z. and Zeng, B. and Shah, M. and Chilimbi, T.",
        TITLE = "{VidLA}: Video-Language Alignment at Scale",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "14043--14055",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223567"}

@inproceedings{bb228569,
        AUTHOR = "Shi, X. X. and Wu, Z. H. and Lee, S.",
        TITLE = "Viewpoint-Aware Visual Grounding in {3D} Scenes",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "14056--14065",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223568"}

@inproceedings{bb228570,
        AUTHOR = "Feng, C. J. and Zhong, Y. J. and Jie, Z. Q. and Xie, W. and Ma, L.",
        TITLE = "{InstaGen}: Enhancing Object Detection by Training on Synthetic Dataset",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "14121--14130",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223569"}

@inproceedings{bb228571,
        AUTHOR = "Chang, C. P. and Wang, S. X. and Pagani, A. and Stricker, D.",
        TITLE = "{MiKASA}: Multi-Key-Anchor \& Scene-Aware Transformer for {3D} Visual
Grounding",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "14131--14140",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223570"}

@inproceedings{bb228572,
        AUTHOR = "Wang, S. and Lin, Y. T. and Wu, Y.",
        TITLE = "{Omni-Q}: Omni-Directional Scene Understanding for Unsupervised Visual
Grounding",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "14261--14270",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223571"}

@inproceedings{bb228573,
        AUTHOR = "Favero, A. and Zancato, L. and Trager, M. and Choudhary, S. and Perera, P. and Achille, A. and Swaminathan, A. and Soatto, S.",
        TITLE = "Multi-Modal Hallucination Control by Visual Information Grounding",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "14303--14312",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223572"}

@inproceedings{bb228574,
        AUTHOR = "Shen, Y. H. and Wang, H. Y. and Yang, X. T. and Feiszli, M. and Elhamifar, E. and Torresani, L. and Mavroudi, E.",
        TITLE = "Learning to Segment Referred Objects from Narrated Egocentric Videos",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "14510--14520",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223573"}

@inproceedings{bb228575,
        AUTHOR = "Xu, C. and Han, Y. H. and Xu, R. and Hui, L. and Xie, J. and Yang, J.",
        TITLE = "Multi-Attribute Interactions Matter for {3D} Visual Grounding",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "17253--17262",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223574"}

@inproceedings{bb228576,
        AUTHOR = "Gu, X. and Fan, H. and Huang, Y. and Luo, T. J. and Zhang, L.",
        TITLE = "Context-Guided Spatio-Temporal Video Grounding",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "18330--18339",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223575"}

@inproceedings{bb228577,
        AUTHOR = "Chen, B. and Shvetsova, N. and Rouditchenko, A. and Kondermann, D. and Thomas, S. and Chang, S. F. and Feris, R. and Glass, J. and Kuehne, H.",
        TITLE = "What, When, and Where? Self-Supervised Spatio-Temporal Grounding in
Untrimmed Multi-Action Videos from Narrated Instructions",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "18419--18429",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223576"}

@inproceedings{bb228578,
        AUTHOR = "Xiao, Y. C. and Luo, Z. and Liu, Y. and Ma, Y. and Bian, H. and Ji, Y. and Yang, Y. and Li, X.",
        TITLE = "Bridging the Gap: A Unified Video Comprehension Framework for Moment
Retrieval and Highlight Detection",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "18709--18719",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223577"}

@inproceedings{bb228579,
        AUTHOR = "Wasim, S. T. and Naseer, M. and Khan, S. and Yang, M. H. and Khan, F. S.",
        TITLE = "{VideoGrounding-DINO}: Towards Open-Vocabulary Spatio-Temporal Video
Grounding",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "18909--18918",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223578"}

@inproceedings{bb228580,
        AUTHOR = "Shao, Y. Y. and He, S. T. and Ye, Q. and Feng, Y. C. and Luo, W. H. and Chen, J. M.",
        TITLE = "Context-Aware Integration of Language and Visual References for
Natural Language Tracking",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "19208--19217",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223579"}

% booktitle is the bare string macro CVPR24; its definition lives outside this entry.
@inproceedings{bb228581,
  author    = {Tao, M. and Bai, B. and Lin, H.Z. and Wang, H. and Wang, Y. and Luo, L. and Fang, L.},
  title     = {When Visual Grounding Meets Gigapixel-Level Large-Scale Scenes: Benchmark and Approach},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {22119-22128},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223580},
}

% booktitle is the bare string macro CVPR24; its definition lives outside this entry.
@inproceedings{bb228582,
  author    = {Chng, Y.X. and Zheng, H. and Han, Y.Z. and Qiu, X. and Huang, G.},
  title     = {Mask Grounding for Referring Image Segmentation},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {26563-26573},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223581},
}

% Braces keep the acronym VTQA in capitals under styles that sentence-case titles.
% booktitle is the bare string macro CVPR24; its definition lives outside this entry.
@inproceedings{bb228583,
  author    = {Chen, K. and Wu, X.Q.},
  title     = {{VTQA}: Visual Text Question Answering via Entity Alignment and Cross-Media Reasoning},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {27208-27217},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223582},
}

% Braces keep the camel-case name GeoChat intact under styles that sentence-case titles.
% booktitle is the bare string macro CVPR24; its definition lives outside this entry.
@inproceedings{bb228584,
  author    = {Kuckreja, K. and Danish, M.S. and Naseer, M. and Das, A. and Khan, S. and Khan, F.S.},
  title     = {{GeoChat}: Grounded Large Vision-Language Model for Remote Sensing},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {27831-27840},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223583},
}

% Braces keep the coined name LQMFormer intact under styles that sentence-case titles.
% booktitle is the bare string macro CVPR24; its definition lives outside this entry.
@inproceedings{bb228585,
  author    = {Shah, N.A. and VS, V. and Patel, V.M.},
  title     = {{LQMFormer}: Language-Aware Query Mask Transformer for Referring Image Segmentation},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {12903-12913},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223584},
}

% booktitle is the bare string macro CVPR24; its definition lives outside this entry.
@inproceedings{bb228586,
  author    = {Wang, W.X. and Yue, T.T. and Zhang, Y. and Guo, L.T. and He, X.J. and Wang, X.L. and Liu, J.},
  title     = {Unveiling Parts Beyond Objects: Towards Finer-Granularity Referring Expression Segmentation},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {12998-13008},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223585},
}

% Braces keep the mixed-case name GLaMM intact under styles that sentence-case titles.
% booktitle is the bare string macro CVPR24; its definition lives outside this entry.
@inproceedings{bb228587,
  author    = {Rasheed, H. and Maaz, M. and Shaji, S. and Shaker, A. and Khan, S. and Cholakkal, H. and Anwer, R.M. and Xing, E. and Yang, M.H. and Khan, F.S.},
  title     = {{GLaMM}: Pixel Grounding Large Multimodal Model},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13009-13018},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223586},
}

% Braces keep CLIP, 3D and 2D-3D in capitals under styles that sentence-case titles.
% booktitle is the bare string macro CVPR24; its definition lives outside this entry.
@inproceedings{bb228588,
  author    = {Zhang, Y.Q. and Luo, H. and Lei, Y.J.},
  title     = {Towards {CLIP}-Driven Language-Free {3D} Visual Grounding via {2D-3D} Relational Enhancement and Consistency},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13063-13072},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223587},
}

% Braces keep the camel-case name DiaLoc intact under styles that sentence-case titles.
% booktitle is the bare string macro CVPR24; its definition lives outside this entry.
@inproceedings{bb228589,
  author    = {Zhang, C. and Li, M. and Budvytis, I. and Liwicki, S.},
  title     = {{DiaLoc}: An Iterative Approach to Embodied Dialog Localization},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {12585-12593},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223588},
}

% booktitle is the bare string macro CVPR24; its definition lives outside this entry.
@inproceedings{bb228590,
  author    = {Xiao, B. and Wu, H.P. and Xu, W.J. and Dai, X.Y. and Hu, H.D. and Lu, Y. and Zeng, M. and Liu, C. and Yuan, L.},
  title     = {Florence-2: Advancing a Unified Representation for a Variety of Vision Tasks},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {4818-4829},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223589},
}

% Braces keep the coined name AffordanceLLM intact under styles that sentence-case titles.
% booktitle is the bare string macro OpenSUN3D24; its definition lives outside this entry.
@inproceedings{bb228591,
  author    = {Qian, S.Y. and Chen, W.F. and Bai, M. and Zhou, X. and Tu, Z.W. and Li, L.E.},
  title     = {{AffordanceLLM}: Grounding Affordance from Vision Language Models},
  booktitle = OpenSUN3D24,
  year      = {2024},
  pages     = {7587-7597},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223590},
}

% booktitle is the bare string macro CVPR24; its definition lives outside this entry.
@inproceedings{bb228592,
  author    = {Di, S.Z. and Xie, W.},
  title     = {Grounded Question-Answering in Long Egocentric Videos},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {12934-12943},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223591},
}

% Braces keep Cross3DVG, 3D and RGB-D intact under styles that sentence-case titles.
% NOTE(review): booktitle is a literal string, not a macro, so "3DV24" prints verbatim;
% presumably it should carry the full proceedings name - confirm before changing.
@inproceedings{bb228593,
  author    = {Miyanishi, T. and Azuma, D. and Kurita, S. and Kawanabe, M.},
  title     = {{Cross3DVG}: Cross-Dataset {3D} Visual Grounding on Different {RGB-D} Scans},
  booktitle = {3DV24},
  year      = {2024},
  pages     = {717-727},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223592},
}

% Braces keep ARNOLD and 3D in capitals under styles that sentence-case titles.
% booktitle is the bare string macro ICCV23; its definition lives outside this entry.
@inproceedings{bb228594,
  author    = {Gong, R. and Huang, J.Y. and Zhao, Y.Z. and Geng, H.R. and Gao, X.F. and Wu, Q.Y. and Ai, W. and Zhou, Z.H. and Terzopoulos, D. and Zhu, S.C. and Jia, B.X. and Huang, S.Y.},
  title     = {{ARNOLD}: A Benchmark for Language-Grounded Task Learning With Continuous States in Realistic {3D} Scenes},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {20426-20438},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223593},
}

% booktitle is the bare string macro ICCV23; its definition lives outside this entry.
@inproceedings{bb228595,
  author    = {Wu, Y. and Wei, Y. and Wang, H.Z. and Liu, Y.F. and Yang, S. and He, X.M.},
  title     = {Grounded Image Text Matching with Mismatched Relation Reasoning},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {2964-2975},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223594},
}

% Braces keep LLM-Planner intact (acronym plus name) under styles that sentence-case titles.
% booktitle is the bare string macro ICCV23; its definition lives outside this entry.
@inproceedings{bb228596,
  author    = {Song, C.H. and Sadler, B.M. and Wu, J. and Chao, W.L. and Washington, C. and Su, Y.},
  title     = {{LLM-Planner}: Few-Shot Grounded Planning for Embodied Agents with Large Language Models},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {2986-2997},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223595},
}

% Braces keep the camel-case name DetermiNet intact under styles that sentence-case titles.
% booktitle is the bare string macro ICCV23; its definition lives outside this entry.
@inproceedings{bb228597,
  author    = {Lee, C. and Kumar, M.G. and Tan, C.},
  title     = {{DetermiNet}: A Large-Scale Diagnostic Dataset for Complex Visually-Grounded Referencing using Determiners},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {19962-19971},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223596},
}

% Braces keep the coined name UniVTG intact under styles that sentence-case titles.
% booktitle is the bare string macro ICCV23; its definition lives outside this entry.
@inproceedings{bb228598,
  author    = {Lin, K.Q. and Zhang, P. and Chen, J. and Pramanick, S. and Gao, D.F. and Wang, A.J.P. and Yan, R. and Shou, M.Z.},
  title     = {{UniVTG}: Towards Unified Video-Language Temporal Grounding},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {2782-2792},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223597},
}

% booktitle is the bare string macro ICCV23; its definition lives outside this entry.
@inproceedings{bb228599,
  author    = {Liu, Y. and Zhang, J.H. and Chen, Q.C. and Peng, Y.X.},
  title     = {Confidence-aware Pseudo-label Learning for Weakly Supervised Visual Grounding},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {2816-2826},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT223598},
}

Last update: Mar 29, 2025 at 10:46:14