% Visual-grounding conference papers, keys bb231400 through bb231499.
%
% Conventions used by the entries below:
%  - Unquoted BOOKTITLE values (ECCV24, CVPR23, L3D-IVU22, ...) are string
%    macros. Their definitions are not part of this section and must be
%    declared before these entries are read. "3DV24" is a quoted literal.
%  - A roman-numeral prefix inside PAGES (e.g. "LXXII: 71--91") is kept as
%    delivered by the upstream data; presumably it is the proceedings
%    part/volume number -- TODO confirm before moving it to a VOLUME field.
%  - BIBSOURCE is a non-standard field holding the upstream record URL;
%    standard styles ignore unknown fields.
%  - Page ranges use "--" and accented letters use brace-wrapped LaTeX
%    escapes so the entries also work with classic 8-bit BibTeX.

@inproceedings{bb231400,
  AUTHOR    = "Leroy, V. and Cabon, Y. and Revaud, J.",
  TITLE     = "Grounding Image Matching in 3D with MASt3R",
  BOOKTITLE = ECCV24,
  YEAR      = "2024",
  PAGES     = "LXXII: 71--91",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226384"
}

@inproceedings{bb231401,
  AUTHOR    = "Unal, O. and Sakaridis, C. and Saha, S. and Van Gool, L.J.",
  TITLE     = "Four Ways to Improve Verbo-visual Fusion for Dense 3D Visual Grounding",
  BOOKTITLE = ECCV24,
  YEAR      = "2024",
  PAGES     = "LXXVI: 196--213",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226385"
}

@inproceedings{bb231402,
  AUTHOR    = "Wan, D. and Cho, J. and Stengel Eskin, E. and Bansal, M.",
  TITLE     = "Contrastive Region Guidance: Improving Grounding in Vision-language Models Without Training",
  BOOKTITLE = ECCV24,
  YEAR      = "2024",
  PAGES     = "LXXIX: 198--215",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226386"
}

@inproceedings{bb231403,
  AUTHOR    = "Zheng, M.H. and Cai, X.H. and Chen, Q.C. and Peng, Y.X. and Liu, Y.",
  TITLE     = "Training-Free Video Temporal Grounding Using Large-Scale Pre-Trained Models",
  BOOKTITLE = ECCV24,
  YEAR      = "2024",
  PAGES     = "LXXXII: 20--37",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226387"
}

@inproceedings{bb231404,
  AUTHOR    = "Bao, P.J. and Shao, Z. and Yang, W.H. and Ng, B.P. and Kot, A.C.",
  TITLE     = "E3M: Zero-shot Spatio-temporal Video Grounding with Expectation-maximization Multimodal Modulation",
  BOOKTITLE = ECCV24,
  YEAR      = "2024",
  PAGES     = "LXXXIII: 227--243",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226388"
}

@inproceedings{bb231405,
  AUTHOR    = "Dong, P.J. and Yang, X.F. and Wang, Q. and Li, Z.X. and Li, T. and Chu, X.W.",
  TITLE     = "Multi-task Domain Adaptation for Language Grounding with 3D Objects",
  BOOKTITLE = ECCV24,
  YEAR      = "2024",
  PAGES     = "XXXIV: 387--404",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226389"
}

@inproceedings{bb231406,
  AUTHOR    = "Hannan, T. and Islam, M.M. and Seidl, T. and Bertasius, G.",
  TITLE     = "RGNet: A Unified Clip Retrieval and Grounding Network for Long Videos",
  BOOKTITLE = ECCV24,
  YEAR      = "2024",
  PAGES     = "XXI: 352--369",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226390"
}

@inproceedings{bb231407,
  AUTHOR    = "Khoshsirat, S. and Kambhamettu, C.",
  TITLE     = "Embedding Attention Blocks for Answer Grounding",
  BOOKTITLE = ICIP24,
  YEAR      = "2024",
  PAGES     = "521--527",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226391"
}

@inproceedings{bb231408,
  AUTHOR    = "Hamilton, M. and Zisserman, A. and Hershey, J.R. and Freeman, W.T.",
  TITLE     = "Separating the 'Chirp' from the 'Chat': Self-supervised Visual Grounding of Sound and Language",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13117--13127",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226392"
}

@inproceedings{bb231409,
  AUTHOR    = "Shen, Y.H. and Fu, C.Y. and Chen, P.X. and Zhang, M. and Li, K. and Sun, X. and Wu, Y.S. and Lin, S.H. and Ji, R.R.",
  TITLE     = "Aligning and Prompting Everything All at Once for Universal Visual Perception",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13193--13203",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226393"
}

@inproceedings{bb231410,
  AUTHOR    = "Wu, T.H. and Biamby, G. and Chan, D. and Dunlap, L. and Gupta, R. and Wang, X.D. and Gonzalez, J.E. and Darrell, T.J.",
  TITLE     = "See, Say, and Segment: Teaching LMMs to Overcome False Premises",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13459--13469",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226394"
}

@inproceedings{bb231411,
  AUTHOR    = "Wang, Y. and Li, Y. and Wang, S.J.",
  TITLE     = "G3-LQ: Marrying Hyperbolic Alignment with Explicit Semantic-Geometric Modeling for 3D Visual Grounding",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13917--13926",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226395"
}

@inproceedings{bb231412,
  AUTHOR    = "Rizve, M.N. and Fei, F. and Unnikrishnan, J. and Tran, S. and Yao, B.Z. and Zeng, B. and Shah, M. and Chilimbi, T.",
  TITLE     = "VidLA: Video-Language Alignment at Scale",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "14043--14055",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226396"
}

@inproceedings{bb231413,
  AUTHOR    = "Shi, X.X. and Wu, Z.H. and Lee, S.",
  TITLE     = "Viewpoint-Aware Visual Grounding in 3D Scenes",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "14056--14065",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226397"
}

@inproceedings{bb231414,
  AUTHOR    = "Feng, C.J. and Zhong, Y.J. and Jie, Z.Q. and Xie, W. and Ma, L.",
  TITLE     = "InstaGen: Enhancing Object Detection by Training on Synthetic Dataset",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "14121--14130",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226398"
}

@inproceedings{bb231415,
  AUTHOR    = "Chang, C.P. and Wang, S.X. and Pagani, A. and Stricker, D.",
  TITLE     = "MiKASA: Multi-Key-Anchor \& Scene-Aware Transformer for 3D Visual Grounding",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "14131--14140",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226399"
}

@inproceedings{bb231416,
  AUTHOR    = "Wang, S. and Lin, Y.T. and Wu, Y.",
  TITLE     = "Omni-Q: Omni-Directional Scene Understanding for Unsupervised Visual Grounding",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "14261--14270",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226400"
}

@inproceedings{bb231417,
  AUTHOR    = "Favero, A. and Zancato, L. and Trager, M. and Choudhary, S. and Perera, P. and Achille, A. and Swaminathan, A. and Soatto, S.",
  TITLE     = "Multi-Modal Hallucination Control by Visual Information Grounding",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "14303--14312",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226401"
}

@inproceedings{bb231418,
  AUTHOR    = "Shen, Y.H. and Wang, H.Y. and Yang, X.T. and Feiszli, M. and Elhamifar, E. and Torresani, L. and Mavroudi, E.",
  TITLE     = "Learning to Segment Referred Objects from Narrated Egocentric Videos",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "14510--14520",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226402"
}

@inproceedings{bb231419,
  AUTHOR    = "Xu, C. and Han, Y.H. and Xu, R. and Hui, L. and Xie, J. and Yang, J.",
  TITLE     = "Multi-Attribute Interactions Matter for 3D Visual Grounding",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "17253--17262",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226403"
}

@inproceedings{bb231420,
  AUTHOR    = "Gu, X. and Fan, H. and Huang, Y. and Luo, T.J. and Zhang, L.",
  TITLE     = "Context-Guided Spatio-Temporal Video Grounding",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "18330--18339",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226404"
}

@inproceedings{bb231421,
  AUTHOR    = "Chen, B. and Shvetsova, N. and Rouditchenko, A. and Kondermann, D. and Thomas, S. and Chang, S.F. and Feris, R. and Glass, J. and Kuehne, H.",
  TITLE     = "What, When, and Where? Self-Supervised Spatio-Temporal Grounding in Untrimmed Multi-Action Videos from Narrated Instructions",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "18419--18429",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226405"
}

@inproceedings{bb231422,
  AUTHOR    = "Xiao, Y.C. and Luo, Z.Y. and Liu, Y. and Ma, Y. and Bian, H.W. and Ji, Y. and Yang, Y.J. and Li, X.",
  TITLE     = "Bridging the Gap: A Unified Video Comprehension Framework for Moment Retrieval and Highlight Detection",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "18709--18719",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226406"
}

@inproceedings{bb231423,
  AUTHOR    = "Wasim, S.T. and Naseer, M. and Khan, S. and Yang, M.H. and Khan, F.S.",
  TITLE     = "VideoGrounding-DINO: Towards Open-Vocabulary Spatio-Temporal Video Grounding",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "18909--18918",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226407"
}

@inproceedings{bb231424,
  AUTHOR    = "Shao, Y.Y. and He, S.T. and Ye, Q. and Feng, Y.C. and Luo, W.H. and Chen, J.M.",
  TITLE     = "Context-Aware Integration of Language and Visual References for Natural Language Tracking",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "19208--19217",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226408"
}

@inproceedings{bb231425,
  AUTHOR    = "Tao, M. and Bai, B. and Lin, H.Z. and Wang, H. and Wang, Y. and Luo, L. and Fang, L.",
  TITLE     = "When Visual Grounding Meets Gigapixel-Level Large-Scale Scenes: Benchmark and Approach",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "22119--22128",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226409"
}

@inproceedings{bb231426,
  AUTHOR    = "Kuckreja, K. and Danish, M.S. and Naseer, M. and Das, A. and Khan, S. and Khan, F.S.",
  TITLE     = "GeoChat: Grounded Large Vision-Language Model for Remote Sensing",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "27831--27840",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226410"
}

@inproceedings{bb231427,
  AUTHOR    = "Rasheed, H. and Maaz, M. and Shaji, S. and Shaker, A. and Khan, S. and Cholakkal, H. and Anwer, R.M. and Xing, E. and Yang, M.H. and Khan, F.S.",
  TITLE     = "GLaMM: Pixel Grounding Large Multimodal Model",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13009--13018",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226411"
}

@inproceedings{bb231428,
  AUTHOR    = "Zhang, Y.Q. and Luo, H. and Lei, Y.J.",
  TITLE     = "Towards CLIP-Driven Language-Free 3D Visual Grounding via 2D-3D Relational Enhancement and Consistency",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13063--13072",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226412"
}

@inproceedings{bb231429,
  AUTHOR    = "Zhang, C. and Li, M. and Budvytis, I. and Liwicki, S.",
  TITLE     = "DiaLoc: An Iterative Approach to Embodied Dialog Localization",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "12585--12593",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226413"
}

@inproceedings{bb231430,
  AUTHOR    = "Xiao, B. and Wu, H.P. and Xu, W.J. and Dai, X.Y. and Hu, H.D. and Lu, Y. and Zeng, M. and Liu, C. and Yuan, L.",
  TITLE     = "Florence-2: Advancing a Unified Representation for a Variety of Vision Tasks",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "4818--4829",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226414"
}

@inproceedings{bb231431,
  AUTHOR    = "Qian, S.Y. and Chen, W.F. and Bai, M. and Zhou, X. and Tu, Z.W. and Li, L.E.",
  TITLE     = "AffordanceLLM: Grounding Affordance from Vision Language Models",
  BOOKTITLE = OpenSUN3D24,
  YEAR      = "2024",
  PAGES     = "7587--7597",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226415"
}

@inproceedings{bb231432,
  AUTHOR    = "Miyanishi, T. and Azuma, D. and Kurita, S. and Kawanabe, M.",
  TITLE     = "Cross3DVG: Cross-Dataset 3D Visual Grounding on Different RGB-D Scans",
  BOOKTITLE = "3DV24",
  YEAR      = "2024",
  PAGES     = "717--727",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226416"
}

@inproceedings{bb231433,
  AUTHOR    = "Gong, R. and Huang, J.Y. and Zhao, Y.Z. and Geng, H.R. and Gao, X.F. and Wu, Q.Y. and Ai, W. and Zhou, Z.H. and Terzopoulos, D. and Zhu, S.C. and Jia, B.X. and Huang, S.Y.",
  TITLE     = "ARNOLD: A Benchmark for Language-Grounded Task Learning With Continuous States in Realistic 3D Scenes",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "20426--20438",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226417"
}

@inproceedings{bb231434,
  AUTHOR    = "Wu, Y. and Wei, Y. and Wang, H.Z. and Liu, Y.F. and Yang, S. and He, X.M.",
  TITLE     = "Grounded Image Text Matching with Mismatched Relation Reasoning",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "2964--2975",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226418"
}

@inproceedings{bb231435,
  AUTHOR    = "Song, C.H. and Sadler, B.M. and Wu, J. and Chao, W.L. and Washington, C. and Su, Y.",
  TITLE     = "LLM-Planner: Few-Shot Grounded Planning for Embodied Agents with Large Language Models",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "2986--2997",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226419"
}

@inproceedings{bb231436,
  AUTHOR    = "Lee, C. and Kumar, M.G. and Tan, C.",
  TITLE     = "DetermiNet: A Large-Scale Diagnostic Dataset for Complex Visually-Grounded Referencing using Determiners",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "19962--19971",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226420"
}

@inproceedings{bb231437,
  AUTHOR    = "Lin, K.Q. and Zhang, P. and Chen, J. and Pramanick, S. and Gao, D.F. and Wang, A.J.P. and Yan, R. and Shou, M.Z.",
  TITLE     = "UniVTG: Towards Unified Video-Language Temporal Grounding",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "2782--2792",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226421"
}

@inproceedings{bb231438,
  AUTHOR    = "Liu, Y. and Zhang, J.H. and Chen, Q.C. and Peng, Y.X.",
  TITLE     = "Confidence-aware Pseudo-label Learning for Weakly Supervised Visual Grounding",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "2816--2826",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226422"
}

@inproceedings{bb231439,
  AUTHOR    = "Khoshsirat, S. and Kambhamettu, C.",
  TITLE     = "Sentence Attention Blocks for Answer Grounding",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "6057--6067",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226423"
}

@inproceedings{bb231440,
  AUTHOR    = "Li, H.X. and Cao, M. and Cheng, X. and Li, Y. and Zhu, Z.H. and Zou, Y.X.",
  TITLE     = "G2L: Semantically Aligned and Uniform Video Grounding via Geodesic and Game Theory",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "11998--12008",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226424"
}

@inproceedings{bb231441,
  AUTHOR    = "Li, H. and Shu, X.J. and He, S. and Qiao, R.Z. and Wen, W. and Guo, T. and Gan, B. and Sun, X.",
  TITLE     = "D3G: Exploring Gaussian Prior for Temporal Sentence Grounding with Glance Annotation",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "13688--13700",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226425"
}

@inproceedings{bb231442,
  AUTHOR    = "Pan, Y.L. and He, X.T. and Gong, B. and Lv, Y.L. and Shen, Y.J. and Peng, Y.X. and Zhao, D.L.",
  TITLE     = "Scanning Only Once: An End-to-end Framework for Fast Temporal Grounding in Long Videos",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "13721--13731",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226426"
}

@inproceedings{bb231443,
  AUTHOR    = "Jang, J. and Park, J. and Kim, J. and Kwon, H. and Sohn, K.H.",
  TITLE     = "Knowing Where to Focus: Event-aware Transformer for Video Grounding",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "13800--13810",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226427"
}

% NOTE(review): the page range of bb231444 starts and ends on 15179 in the
% upstream data; the end page looks wrong -- verify against the proceedings.
@inproceedings{bb231444,
  AUTHOR    = "Zhang, Y.M. and Gong, Z. and Chang, A.X.",
  TITLE     = "Multi3DRefer: Grounding Text Description to Multiple 3D Objects",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "15179--15179",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226428"
}

@inproceedings{bb231445,
  AUTHOR    = "Li, H. and Wei, P. and Ma, Z. and Zheng, N.N.",
  TITLE     = "Inverse Compositional Learning for Weakly-supervised Relation Grounding",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "15431--15441",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226429"
}

@inproceedings{bb231446,
  AUTHOR    = "Chen, D.Z.Y. and Hu, R.H. and Chen, X.L. and Nie{\ss}ner, M. and Chang, A.X.",
  TITLE     = "UniT3D: A Unified Transformer for 3D Dense Captioning and Visual Grounding",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "18063--18073",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226430"
}

@inproceedings{bb231447,
  AUTHOR    = "de la Jara, I.M. and Rodriguez Opazo, C. and Marrese Taylor, E. and Bravo Marquez, F.",
  TITLE     = "An empirical study of the effect of video encoders on Temporal Video Grounding",
  BOOKTITLE = CLVL23,
  YEAR      = "2023",
  PAGES     = "2842--2847",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226431"
}

@inproceedings{bb231448,
  AUTHOR    = "Wang, Z. and Huang, H.F. and Zhao, Y. and Li, L.J. and Cheng, X.Z. and Zhu, Y.C. and Yin, A. and Zhao, Z.",
  TITLE     = "Distilling Coarse-to-Fine Semantic Matching Knowledge for Weakly Supervised 3D Visual Grounding",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "2662--2671",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226432"
}

@inproceedings{bb231449,
  AUTHOR    = "Guo, Z. and Tang, Y.W. and Zhang, R. and Wang, D. and Wang, Z.G. and Zhao, B. and Li, X.L.",
  TITLE     = "ViewRefer: Grasp the Multi-view Knowledge for 3D Visual Grounding",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "15326--15337",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226433"
}

@inproceedings{bb231450,
  AUTHOR    = "Hsu, J. and Mao, J.Y. and Wu, J.J.",
  TITLE     = "NS3D: Neuro-Symbolic Grounding of 3D Objects and Relations",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "2614--2623",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226434"
}

@inproceedings{bb231451,
  AUTHOR    = "Yi, J. and Uzkent, B. and Ignat, O. and Li, Z.L. and Garg, A. and Yu, X. and Liu, L.",
  TITLE     = "Augment the Pairs: Semantics-Preserving Image-Caption Pair Augmentation for Grounding-Based Vision and Language Models",
  BOOKTITLE = WACV24,
  YEAR      = "2024",
  PAGES     = "5508--5518",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226435"
}

@inproceedings{bb231452,
  AUTHOR    = "Uzkent, B. and Garg, A. and Zhu, W.T. and Doshi, K. and Yi, J. and Wang, X.L. and Omar, M.",
  TITLE     = "Dynamic Inference with Grounding Based Vision and Language Models",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "2624--2633",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226436"
}

@inproceedings{bb231453,
  AUTHOR    = "Cao, M. and Wei, F.Y. and Xu, C. and Geng, X. and Chen, L. and Zhang, C. and Zou, Y.X. and Shen, T. and Jiang, D.X.",
  TITLE     = "Iterative Proposal Refinement for Weakly-Supervised Video Grounding",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "6524--6534",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226437"
}

@inproceedings{bb231454,
  AUTHOR    = "Wang, L. and Mittal, G. and Sajeev, S. and Yu, Y. and Hall, M. and Boddeti, V.N. and Chen, M.",
  TITLE     = "ProT{\'e}G{\'e}: Untrimmed Pretraining for Video Temporal Grounding by Video Temporal Grounding",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "6575--6585",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226438"
}

@inproceedings{bb231455,
  AUTHOR    = "Hwang, M.Y. and Jeong, J.Y. and Kim, M.S. and Oh, Y. and Oh, S.H.",
  TITLE     = "Meta-Explore: Exploratory Hierarchical Vision-and-Language Navigation Using Scene Object Spectrum Grounding",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "6683--6693",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226439"
}

@inproceedings{bb231456,
  AUTHOR    = "Chen, J. and Gao, D.F. and Lin, K.Q. and Shou, M.Z.",
  TITLE     = "Affordance Grounding from Demonstration Video to Target Image",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "6799--6808",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226440"
}

@inproceedings{bb231457,
  AUTHOR    = "Shaharabany, T. and Wolf, L.B.",
  TITLE     = "Similarity Maps for Self-Training Weakly-Supervised Phrase Grounding",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "6925--6934",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226441"
}

@inproceedings{bb231458,
  AUTHOR    = "Su, W. and Miao, P. and Dou, H.Z. and Wang, G. and Qiao, L. and Li, Z. and Li, X.",
  TITLE     = "Language Adaptive Weight Generation for Multi-Task Visual Grounding",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "10857--10866",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226442"
}

@inproceedings{bb231459,
  AUTHOR    = "Li, G. and Jampani, V. and Sun, D.Q. and Sevilla Lara, L.",
  TITLE     = "LOCATE: Localize and Transfer Object Parts for Weakly Supervised Affordance Grounding",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "10922--10931",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226443"
}

@inproceedings{bb231460,
  AUTHOR    = "Kim, S. and Oh, J. and Lee, S. and Yu, S. and Do, J. and Taghavi, T.",
  TITLE     = "Grounding Counterfactual Explanation of Image Classifiers to Textual Concept Space",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "10942--10950",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226444"
}

@inproceedings{bb231461,
  AUTHOR    = "Zhang, Y.M. and Chen, X. and Jia, J.H. and Liu, S. and Ding, K.",
  TITLE     = "Text-Visual Prompting for Efficient 2D Temporal Video Grounding",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "14794--14804",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226445"
}

@inproceedings{bb231462,
  AUTHOR    = "Chen, Z.H. and Zhang, R. and Song, Y.B. and Wan, X. and Li, G.B.",
  TITLE     = "Advancing Visual Grounding with Scene Knowledge: Benchmark and Method",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "15039--15049",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226446"
}

@inproceedings{bb231463,
  AUTHOR    = "Huang, Y.F. and Yang, L. and Sato, Y.",
  TITLE     = "Weakly Supervised Temporal Sentence Grounding with Uncertainty-Guided Self-training",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "18908--18918",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226447"
}

@inproceedings{bb231464,
  AUTHOR    = "Tan, C.L. and Lin, Z.H. and Hu, J.F. and Zheng, W.S. and Lai, J.H.",
  TITLE     = "Hierarchical Semantic Correspondence Networks for Video Paragraph Grounding",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "18973--18982",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226448"
}

@inproceedings{bb231465,
  AUTHOR    = "Yang, Z.Y. and Kafle, K. and Dernoncourt, F. and Ordonez, V.",
  TITLE     = "Improving Visual Grounding by Encouraging Consistent Gradient-Based Explanations",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "19165--19174",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226449"
}

@inproceedings{bb231466,
  AUTHOR    = "Wu, Y.M. and Cheng, X.H. and Zhang, R.R. and Cheng, Z. and Zhang, J.",
  TITLE     = "EDA: Explicit Text-Decoupling and Dense Alignment for 3D Visual Grounding",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "19231--19242",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226450"
}

@inproceedings{bb231467,
  AUTHOR    = "Li, M.Z. and Wang, H. and Zhang, W.Q. and Miao, J.X. and Zhao, Z. and Zhang, S.Y. and Ji, W. and Wu, F.",
  TITLE     = "WINNER: Weakly-supervised hIerarchical decompositioN and aligNment for spatio-tEmporal video gRounding",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "23090--23099",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226451"
}

@inproceedings{bb231468,
  AUTHOR    = "Lin, Z.H. and Tan, C.L. and Hu, J.F. and Jin, Z. and Ye, T. and Zheng, W.S.",
  TITLE     = "Collaborative Static and Dynamic Vision-Language Streams for Spatio-Temporal Video Grounding",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "23100--23109",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226452"
}

@inproceedings{bb231469,
  AUTHOR    = "Yang, L. and Kong, Q. and Yang, H.K. and Kehl, W. and Sato, Y. and Kobori, N.",
  TITLE     = "DeCo: Decomposition and Reconstruction for Compositional Temporal Grounding via Coarse-to-Fine Contrastive Ranking",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "23130--23140",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226453"
}

@inproceedings{bb231470,
  AUTHOR    = "Zhou, L. and Zhou, Z. and Mao, K. and He, Z.Y.",
  TITLE     = "Joint Visual Grounding and Tracking with Natural Language Specification",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "23151--23160",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226454"
}

@inproceedings{bb231471,
  AUTHOR    = "Devaraj, C. and Fermuller, C. and Aloimonos, Y.F.",
  TITLE     = "Incorporating Visual Grounding In GCN For Zero-shot Learning Of Human Object Interaction Actions",
  BOOKTITLE = L3D-IVU23,
  YEAR      = "2023",
  PAGES     = "5008--5017",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226455"
}

@inproceedings{bb231472,
  AUTHOR    = "Fang, X. and Liu, D.Z. and Zhou, P. and Nan, G.S.",
  TITLE     = "You Can Ground Earlier than See: An Effective and Efficient Pipeline for Temporal Sentence Grounding in Compressed Videos",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "2448--2460",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226456"
}

@inproceedings{bb231473,
  AUTHOR    = "Fu, T.J. and Li, L.J. and Gan, Z. and Lin, K. and Wang, W.Y. and Wang, L.J. and Liu, Z.C.",
  TITLE     = "An Empirical Study of End-to-End Video-Language Transformers with Masked Visual Modeling",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "22898--22909",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226457"
}

@inproceedings{bb231474,
  AUTHOR    = "Li, L.J. and Gan, Z. and Lin, K. and Lin, C.C. and Liu, Z.C. and Liu, C. and Wang, L.J.",
  TITLE     = "LAVENDER: Unifying Video-Language Understanding as Masked Language Modeling",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "23119--23129",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226458"
}

@inproceedings{bb231475,
  AUTHOR    = "Dong, J.X. and Yin, Z.Z.",
  TITLE     = "Boundary-aware Temporal Sentence Grounding with Adaptive Proposal Refinement",
  BOOKTITLE = ACCV22,
  YEAR      = "2022",
  PAGES     = "IV:641--657",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226459"
}

@inproceedings{bb231476,
  AUTHOR    = "Gao, Y.Z. and Lu, Z.W.",
  TITLE     = "SST-VLM: Sparse Sampling-twice Inspired Video-language Model",
  BOOKTITLE = ACCV22,
  YEAR      = "2022",
  PAGES     = "IV:537--553",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226460"
}

@inproceedings{bb231477,
  AUTHOR    = "Pacheco Ortega, A. and Mayol Cuervas, W.",
  TITLE     = "One-shot Learning for Human Affordance Detection",
  BOOKTITLE = CVMeta22,
  YEAR      = "2022",
  PAGES     = "758--766",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226461"
}

@inproceedings{bb231478,
  AUTHOR    = "Ho, C.H. and Appalaraju, S. and Jasani, B. and Manmatha, R. and Vasconcelos, N.M.",
  TITLE     = "YORO - Lightweight End to End Visual Grounding",
  BOOKTITLE = CMMP22,
  YEAR      = "2022",
  PAGES     = "3--23",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226462"
}

@inproceedings{bb231479,
  AUTHOR    = "Kim, D. and Park, J. and Lee, J.Y. and Park, S. and Sohn, K.H.",
  TITLE     = "Language-free Training for Zero-shot Video Grounding",
  BOOKTITLE = WACV23,
  YEAR      = "2023",
  PAGES     = "2538--2547",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226463"
}

@inproceedings{bb231480,
  AUTHOR    = "Chou, S.H. and Fan, Z.C. and Little, J.J. and Sigal, L.",
  TITLE     = "Semi-Supervised Grounding Alignment for Multi-Modal Feature Learning",
  BOOKTITLE = CRV22,
  YEAR      = "2022",
  PAGES     = "48--57",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226464"
}

@inproceedings{bb231481,
  AUTHOR    = "Chen, D.Z.Y. and Wu, Q.R. and Nie{\ss}ner, M. and Chang, A.X.",
  TITLE     = "D3Net: A Unified Speaker-Listener Architecture for 3D Dense Captioning and Visual Grounding",
  BOOKTITLE = ECCV22,
  YEAR      = "2022",
  PAGES     = "XXXII:487--505",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226465"
}

@inproceedings{bb231482,
  AUTHOR    = "Parcalabescu, L. and Frank, A.",
  TITLE     = "Exploring Phrase Grounding without Training: Contextualisation and Extension to Text-Based Image Retrieval",
  BOOKTITLE = MULWS20,
  YEAR      = "2020",
  PAGES     = "4137--4146",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226466"
}

@inproceedings{bb231483,
  AUTHOR    = "Tung, H. and Harley, A.W. and Huang, L. and Fragkiadaki, K.",
  TITLE     = "Reward Learning from Narrated Demonstrations",
  BOOKTITLE = CVPR18,
  YEAR      = "2018",
  PAGES     = "7004--7013",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226467"
}

@inproceedings{bb231484,
  AUTHOR    = "Cohen, N. and Gal, R. and Meirom, E.A. and Chechik, G. and Atzmon, Y.",
  TITLE     = "'This Is My Unicorn, Fluffy': Personalizing Frozen Vision-Language Representations",
  BOOKTITLE = ECCV22,
  YEAR      = "2022",
  PAGES     = "XX:558--577",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226468"
}

@inproceedings{bb231485,
  AUTHOR    = "Lee, J.H. and Kang, J.W.",
  TITLE     = "Relation Enhanced Vision Language Pre-Training",
  BOOKTITLE = ICIP22,
  YEAR      = "2022",
  PAGES     = "2286--2290",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226469"
}

@inproceedings{bb231486,
  AUTHOR    = "Khan, Z. and Kumar, B.G.V. and Yu, X. and Schulter, S. and Chandraker, M. and Fu, Y.",
  TITLE     = "Single-Stream Multi-level Alignment for Vision-Language Pretraining",
  BOOKTITLE = ECCV22,
  YEAR      = "2022",
  PAGES     = "XXXVI:735--751",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226470"
}

@inproceedings{bb231487,
  AUTHOR    = "Wang, R. and Zhao, H. and Gao, Y.",
  TITLE     = "CYBORGS: Contrastively Bootstrapping Object Representations by Grounding in Segmentation",
  BOOKTITLE = ECCV22,
  YEAR      = "2022",
  PAGES     = "XXXI:260--277",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226471"
}

@inproceedings{bb231488,
  AUTHOR    = "Yang, Z.Y. and Gan, Z. and Wang, J.F. and Hu, X.W. and Ahmed, F. and Liu, Z.C. and Lu, Y. and Wang, L.J.",
  TITLE     = "UniTAB: Unifying Text and Box Outputs for Grounded Vision-Language Modeling",
  BOOKTITLE = ECCV22,
  YEAR      = "2022",
  PAGES     = "XXXVI:521--539",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226472"
}

@inproceedings{bb231489,
  AUTHOR    = "Li, H. and Wei, P. and Li, J.P. and Ma, Z. and Shang, J. and Zheng, N.N.",
  TITLE     = "Asymmetric Relation Consistency Reasoning for Video Relation Grounding",
  BOOKTITLE = ECCV22,
  YEAR      = "2022",
  PAGES     = "XXXV:125--141",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226473"
}

@inproceedings{bb231490,
  AUTHOR    = "Dvornik, N. and Hadji, I. and Pham, H. and Bhatt, D. and Martinez, B. and Fazly, A. and Jepson, A.D.",
  TITLE     = "Flow Graph to Video Grounding for Weakly-Supervised Multi-step Localization",
  BOOKTITLE = ECCV22,
  YEAR      = "2022",
  PAGES     = "XXXV:319--335",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226474"
}

@inproceedings{bb231491,
  AUTHOR    = "Qu, M.X. and Wu, Y. and Liu, W. and Gong, Q.Q. and Liang, X.D. and Russakovsky, O. and Zhao, Y. and Wei, Y.C.",
  TITLE     = "SiRi: A Simple Selective Retraining Mechanism for Transformer-Based Visual Grounding",
  BOOKTITLE = ECCV22,
  YEAR      = "2022",
  PAGES     = "XXXV:546--562",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226475"
}

@inproceedings{bb231492,
  AUTHOR    = "Zhu, C.Y. and Zhou, Y. and Shen, Y.H. and Luo, G. and Pan, X.J. and Chen, M.B.L.C. and Cao, L.J. and Sun, X.S. and Ji, R.R.",
  TITLE     = "SeqTR: A Simple Yet Universal Network for Visual Grounding",
  BOOKTITLE = ECCV22,
  YEAR      = "2022",
  PAGES     = "XXXV:598--615",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226476"
}

@inproceedings{bb231493,
  AUTHOR    = "Hao, J.C. and Sun, H.F. and Ren, P.F. and Wang, J.Y. and Qi, Q. and Liao, J.X.",
  TITLE     = "Can Shuffling Video Benefit Temporal Bias Problem: A Novel Training Framework for Temporal Grounding",
  BOOKTITLE = ECCV22,
  YEAR      = "2022",
  PAGES     = "XXXVI:130--147",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226477"
}

@inproceedings{bb231494,
  AUTHOR    = "Jain, A. and Gkanatsios, N. and Mediratta, I. and Fragkiadaki, K.",
  TITLE     = "Bottom Up Top Down Detection Transformers for Language Grounding in Images and Point Clouds",
  BOOKTITLE = ECCV22,
  YEAR      = "2022",
  PAGES     = "XXXVI:417--433",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226478"
}

@inproceedings{bb231495,
  AUTHOR    = "Heisler, M. and Banitalebi Dehkordi, A. and Zhang, Y.",
  TITLE     = "SemAug: Semantically Meaningful Image Augmentations for Object Detection Through Language Grounding",
  BOOKTITLE = ECCV22,
  YEAR      = "2022",
  PAGES     = "XXXVI:610--626",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226479"
}

@inproceedings{bb231496,
  AUTHOR    = "Min, S. and Park, N. and Kim, S. and Park, S.H. and Kim, J.",
  TITLE     = "Grounding Visual Representations with Texts for Domain Generalization",
  BOOKTITLE = ECCV22,
  YEAR      = "2022",
  PAGES     = "XXXVII:37--53",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226480"
}

@inproceedings{bb231497,
  AUTHOR    = "Wang, J. and Wu, H.Y. and Chen, J.C. and Shuai, H.H. and Cheng, W.H.",
  TITLE     = "Residual Graph Attention Network and Expression-Respect Data Augmentation Aided Visual Grounding",
  BOOKTITLE = ICIP22,
  YEAR      = "2022",
  PAGES     = "326--330",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226481"
}

@inproceedings{bb231498,
  AUTHOR    = "Xiong, Z. and Liu, D. and Zhou, P.",
  TITLE     = "Gaussian Kernel-Based Cross Modal Network for Spatio-Temporal Video Grounding",
  BOOKTITLE = ICIP22,
  YEAR      = "2022",
  PAGES     = "2481--2485",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226482"
}

@inproceedings{bb231499,
  AUTHOR    = "Alaniz, S. and Federici, M. and Akata, Z.",
  TITLE     = "Compositional Mixture Representations for Vision and Text",
  BOOKTITLE = L3D-IVU22,
  YEAR      = "2022",
  PAGES     = "4201--4210",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT226483"
}