@inproceedings{bb230500,
        AUTHOR = "Tan, C. L. and Lin, Z. H. and Hu, J. F. and Zheng, W. S. and Lai, J. H.",
        TITLE = "Hierarchical Semantic Correspondence Networks for Video Paragraph
Grounding",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "18973--18982",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225488"}

@inproceedings{bb230501,
        AUTHOR = "Yang, Z. Y. and Kafle, K. and Dernoncourt, F. and Ordonez, V.",
        TITLE = "Improving Visual Grounding by Encouraging Consistent Gradient-Based
Explanations",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "19165--19174",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225489"}

@inproceedings{bb230502,
        AUTHOR = "Wu, Y. M. and Cheng, X. H. and Zhang, R. R. and Cheng, Z. and Zhang, J.",
        TITLE = "{EDA}: Explicit Text-Decoupling and Dense Alignment for {3D} Visual
Grounding",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "19231--19242",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225490"}

@inproceedings{bb230503,
        AUTHOR = "Li, M. Z. and Wang, H. and Zhang, W. Q. and Miao, J. X. and Zhao, Z. and Zhang, S. Y. and Ji, W. and Wu, F.",
        TITLE = "{WINNER}: Weakly-supervised {hIerarchical} {decompositioN} and {aligNment}
for {spatio-tEmporal} video {gRounding}",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "23090--23099",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225491"}

@inproceedings{bb230504,
        AUTHOR = "Lin, Z. H. and Tan, C. L. and Hu, J. F. and Jin, Z. and Ye, T. and Zheng, W. S.",
        TITLE = "Collaborative Static and Dynamic Vision-Language Streams for
Spatio-Temporal Video Grounding",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "23100--23109",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225492"}

@inproceedings{bb230505,
        AUTHOR = "Yang, L. and Kong, Q. and Yang, H. K. and Kehl, W. and Sato, Y. and Kobori, N.",
        TITLE = "{DeCo}: Decomposition and Reconstruction for Compositional Temporal
Grounding via Coarse-to-Fine Contrastive Ranking",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "23130--23140",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225493"}

@inproceedings{bb230506,
        AUTHOR = "Zhou, L. and Zhou, Z. and Mao, K. and He, Z. Y.",
        TITLE = "Joint Visual Grounding and Tracking with Natural Language
Specification",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "23151--23160",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225494"}

@inproceedings{bb230507,
        AUTHOR = "Devaraj, C. and Fermuller, C. and Aloimonos, Y. F.",
        TITLE = "Incorporating Visual Grounding In {GCN} For Zero-shot Learning Of Human
Object Interaction Actions",
        BOOKTITLE = L3D-IVU23,
        YEAR = "2023",
        PAGES = "5008--5017",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225495"}

@inproceedings{bb230508,
        AUTHOR = "Fang, X. and Liu, D. Z. and Zhou, P. and Nan, G. S.",
        TITLE = "You Can Ground Earlier than See: An Effective and Efficient Pipeline
for Temporal Sentence Grounding in Compressed Videos",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "2448--2460",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225496"}

@inproceedings{bb230509,
        AUTHOR = "Fu, T. J. and Li, L. J. and Gan, Z. and Lin, K. and Wang, W. Y. and Wang, L. J. and Liu, Z. C.",
        TITLE = "An Empirical Study of End-to-End Video-Language Transformers with
Masked Visual Modeling",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "22898--22909",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225497"}

@inproceedings{bb230510,
        AUTHOR = "Li, L. J. and Gan, Z. and Lin, K. and Lin, C. C. and Liu, Z. C. and Liu, C. and Wang, L. J.",
        TITLE = "{LAVENDER}: Unifying Video-Language Understanding as Masked Language
Modeling",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "23119--23129",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225498"}

@inproceedings{bb230511,
        AUTHOR = "Dong, J. X. and Yin, Z. Z.",
        TITLE = "Boundary-aware Temporal Sentence Grounding with Adaptive Proposal
Refinement",
        BOOKTITLE = ACCV22,
        YEAR = "2022",
        PAGES = "IV:641--657",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225499"}

@inproceedings{bb230512,
        AUTHOR = "Gao, Y. Z. and Lu, Z. W.",
        TITLE = "{SST-VLM}: Sparse Sampling-twice Inspired Video-language Model",
        BOOKTITLE = ACCV22,
        YEAR = "2022",
        PAGES = "IV:537--553",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225500"}

@inproceedings{bb230513,
        AUTHOR = "Pacheco Ortega, A. and Mayol Cuevas, W.",
        TITLE = "One-shot Learning for Human Affordance Detection",
        BOOKTITLE = CVMeta22,
        YEAR = "2022",
        PAGES = "758--766",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225501"}

@inproceedings{bb230514,
        AUTHOR = "Ho, C. H. and Appalaraju, S. and Jasani, B. and Manmatha, R. and Vasconcelos, N. M.",
        TITLE = "{YORO} - Lightweight End to End Visual Grounding",
        BOOKTITLE = CMMP22,
        YEAR = "2022",
        PAGES = "3--23",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225502"}

@inproceedings{bb230515,
        AUTHOR = "Kim, D. and Park, J. and Lee, J. Y. and Park, S. and Sohn, K. H.",
        TITLE = "Language-free Training for Zero-shot Video Grounding",
        BOOKTITLE = WACV23,
        YEAR = "2023",
        PAGES = "2538--2547",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225503"}

@inproceedings{bb230516,
        AUTHOR = "Le, T. M. and Le, V. and Gupta, S. I. and Venkatesh, S. and Tran, T.",
        TITLE = "Guiding Visual Question Answering with Attention Priors",
        BOOKTITLE = WACV23,
        YEAR = "2023",
        PAGES = "4370--4379",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225504"}

@inproceedings{bb230517,
        AUTHOR = "Chou, S. H. and Fan, Z. C. and Little, J. J. and Sigal, L.",
        TITLE = "Semi-Supervised Grounding Alignment for Multi-Modal Feature Learning",
        BOOKTITLE = CRV22,
        YEAR = "2022",
        PAGES = "48--57",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225505"}

% NOTE(review): BOOKTITLE below is a quoted literal, whereas the other entries
% in this file use unquoted string macros (e.g. ICIP22, ECCV22). If an ICPR22
% macro is defined elsewhere, drop the quotes so it expands -- verify first.
@inproceedings{bb230518,
        AUTHOR = "Gupta, K. and Gautam, D. and Mamidi, R.",
        TITLE = "{cViL}: Cross-Lingual Training of Vision-Language Models using
Knowledge Distillation",
        BOOKTITLE = "ICPR22",
        YEAR = "2022",
        PAGES = "1734--1741",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225506"}

@inproceedings{bb230519,
        AUTHOR = "Chen, D. Z. Y. and Wu, Q. R. and Nie{\ss}ner, M. and Chang, A. X.",
        TITLE = "{D$^3$Net}: A Unified Speaker-Listener Architecture for
{3D} Dense Captioning and Visual Grounding",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXII:487--505",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225507"}

@inproceedings{bb230520,
        AUTHOR = "Parcalabescu, L. and Frank, A.",
        TITLE = "Exploring Phrase Grounding without Training: Contextualisation and
Extension to Text-Based Image Retrieval",
        BOOKTITLE = MULWS20,
        YEAR = "2020",
        PAGES = "4137--4146",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225508"}

@inproceedings{bb230521,
        AUTHOR = "Tung, H. and Harley, A. W. and Huang, L. and Fragkiadaki, K.",
        TITLE = "Reward Learning from Narrated Demonstrations",
        BOOKTITLE = CVPR18,
        YEAR = "2018",
        PAGES = "7004--7013",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225509"}

@inproceedings{bb230522,
        AUTHOR = "Cohen, N. and Gal, R. and Meirom, E. A. and Chechik, G. and Atzmon, Y.",
        TITLE = "`{This} Is My Unicorn, {Fluffy}':
Personalizing Frozen Vision-Language Representations",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XX:558--577",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225510"}

@inproceedings{bb230523,
        AUTHOR = "Lee, J. H. and Kang, J. W.",
        TITLE = "Relation Enhanced Vision Language Pre-Training",
        BOOKTITLE = ICIP22,
        YEAR = "2022",
        PAGES = "2286--2290",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225511"}

@inproceedings{bb230524,
        AUTHOR = "Khan, Z. and Kumar, B. G. V. and Yu, X. and Schulter, S. and Chandraker, M. and Fu, Y.",
        TITLE = "Single-Stream Multi-level Alignment for Vision-Language Pretraining",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVI:735--751",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225512"}

@inproceedings{bb230525,
        AUTHOR = "Wang, R. and Zhao, H. and Gao, Y.",
        TITLE = "{CYBORGS}: Contrastively Bootstrapping Object Representations by
Grounding in Segmentation",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXI:260--277",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225513"}

@inproceedings{bb230526,
        AUTHOR = "Yang, Z. Y. and Gan, Z. and Wang, J. F. and Hu, X. W. and Ahmed, F. and Liu, Z. C. and Lu, Y. and Wang, L. J.",
        TITLE = "{UniTAB}: Unifying Text and Box Outputs for Grounded Vision-Language
Modeling",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVI:521--539",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225514"}

@inproceedings{bb230527,
        AUTHOR = "Li, H. and Wei, P. and Li, J. P. and Ma, Z. and Shang, J. and Zheng, N. N.",
        TITLE = "Asymmetric Relation Consistency Reasoning for Video Relation Grounding",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXV:125--141",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225515"}

@inproceedings{bb230528,
        AUTHOR = "Dvornik, N. and Hadji, I. and Pham, H. and Bhatt, D. and Martinez, B. and Fazly, A. and Jepson, A. D.",
        TITLE = "Flow Graph to Video Grounding for Weakly-Supervised Multi-step
Localization",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXV:319--335",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225516"}

@inproceedings{bb230529,
        AUTHOR = "Qu, M. X. and Wu, Y. and Liu, W. and Gong, Q. Q. and Liang, X. D. and Russakovsky, O. and Zhao, Y. and Wei, Y. C.",
        TITLE = "{SiRi}: A Simple Selective Retraining Mechanism for Transformer-Based
Visual Grounding",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXV:546--562",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225517"}

@inproceedings{bb230530,
        AUTHOR = "Zhu, C. Y. and Zhou, Y. and Shen, Y. H. and Luo, G. and Pan, X. J. and Chen, M. B. L. C. and Cao, L. J. and Sun, X. S. and Ji, R. R.",
        TITLE = "{SeqTR}: A Simple Yet Universal Network for Visual Grounding",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXV:598--615",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225518"}

@inproceedings{bb230531,
        AUTHOR = "Khan, A. U. and Kuehne, H. and Gan, C. and da Vitoria Lobo, N. and Shah, M.",
        TITLE = "Weakly Supervised Grounding for {VQA} in Vision-Language Transformers",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXV:652--670",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225519"}

@inproceedings{bb230532,
        AUTHOR = "Hao, J. C. and Sun, H. F. and Ren, P. F. and Wang, J. Y. and Qi, Q. and Liao, J. X.",
        TITLE = "Can Shuffling Video Benefit Temporal Bias Problem: A Novel Training
Framework for Temporal Grounding",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVI:130--147",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225520"}

@inproceedings{bb230533,
        AUTHOR = "Jain, A. and Gkanatsios, N. and Mediratta, I. and Fragkiadaki, K.",
        TITLE = "Bottom Up Top Down Detection Transformers for Language Grounding in
Images and Point Clouds",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVI:417--433",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225521"}

@inproceedings{bb230534,
        AUTHOR = "Heisler, M. and Banitalebi Dehkordi, A. and Zhang, Y.",
        TITLE = "{SemAug}: Semantically Meaningful Image Augmentations for Object
Detection Through Language Grounding",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVI:610--626",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225522"}

@inproceedings{bb230535,
        AUTHOR = "Min, S. and Park, N. and Kim, S. and Park, S. H. and Kim, J.",
        TITLE = "Grounding Visual Representations with Texts for Domain Generalization",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVII:37--53",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225523"}

@inproceedings{bb230536,
        AUTHOR = "Wang, J. and Wu, H. Y. and Chen, J. C. and Shuai, H. H. and Cheng, W. H.",
        TITLE = "Residual Graph Attention Network and Expression-Respect Data
Augmentation Aided Visual Grounding",
        BOOKTITLE = ICIP22,
        YEAR = "2022",
        PAGES = "326--330",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225524"}

@inproceedings{bb230537,
        AUTHOR = "Xiong, Z. and Liu, D. and Zhou, P.",
        TITLE = "{Gaussian} Kernel-Based Cross Modal Network for Spatio-Temporal Video
Grounding",
        BOOKTITLE = ICIP22,
        YEAR = "2022",
        PAGES = "2481--2485",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225525"}

@inproceedings{bb230538,
        AUTHOR = "Alaniz, S. and Federici, M. and Akata, Z.",
        TITLE = "Compositional Mixture Representations for Vision and Text",
        BOOKTITLE = L3D-IVU22,
        YEAR = "2022",
        PAGES = "4201--4210",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225526"}

@inproceedings{bb230539,
        AUTHOR = "Cho, J. and Yoon, Y. and Kwak, S.",
        TITLE = "Collaborative Transformers for Grounded Situation Recognition",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "19627--19636",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225527"}

@inproceedings{bb230540,
        AUTHOR = "Singh, A. and Hu, R. H. and Goswami, V. and Couairon, G. and Galuba, W. and Rohrbach, M. and Kiela, D.",
        TITLE = "{FLAVA}: A Foundational Language And Vision Alignment Model",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "15617--15629",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225528"}

@inproceedings{bb230541,
        AUTHOR = "Saini, N. and Pham, K. and Shrivastava, A.",
        TITLE = "Disentangling Visual Embeddings for Attributes and Objects",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "13648--13657",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225529"}

@inproceedings{bb230542,
        AUTHOR = "Ge, Y. Y. and Ge, Y. X. and Liu, X. H. and Wang, J. P. and Wu, J. P. and Shan, Y. and Qie, X. and Luo, P.",
        TITLE = "{MILES}: Visual {BERT} Pre-training with Injected Language Semantics for
Video-Text Retrieval",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXV:691--708",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225530"}

@inproceedings{bb230543,
        AUTHOR = "Wang, A. J. P. and Ge, Y. X. and Cai, G. and Yan, R. and Lin, X. D. and Shan, Y. and Qie, X. and Shou, M. Z.",
        TITLE = "Object-aware Video-language Pre-training for Retrieval",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "3303--3312",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225531"}

@inproceedings{bb230544,
        AUTHOR = "Li, D. X. and Li, J. N. and Li, H. D. and Niebles, J. C. and Hoi, S. C. H.",
        TITLE = "Align and Prompt: Video-and-Language Pre-training with Entity Prompts",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "4943--4953",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225532"}

@inproceedings{bb230545,
        AUTHOR = "Xue, H. W. and Hang, T. and Zeng, Y. H. and Sun, Y. C. and Liu, B. and Yang, H. and Fu, J. L. and Guo, B. N.",
        TITLE = "Advancing High-Resolution Video-Language Representation with
Large-Scale Video Transcriptions",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "5026--5035",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225533"}

@inproceedings{bb230546,
        AUTHOR = "Sammani, F. and Mukherjee, T. and Deligiannis, N.",
        TITLE = "{NLX-GPT}: A Model for Natural Language Explanations in Vision and
Vision-Language Tasks",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "8312--8322",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225534"}

@inproceedings{bb230547,
        AUTHOR = "Lin, B. Q. and Zhu, Y. and Chen, Z. C. and Liang, X. and Liu, J. Z. and Liang, X. D.",
        TITLE = "{ADAPT}: Vision-Language Navigation with Modality-Aligned Action
Prompts",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "15375--15385",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225535"}

@inproceedings{bb230548,
        AUTHOR = "Dou, Z. Y. and Xu, Y. C. and Gan, Z. and Wang, J. F. and Wang, S. H. and Wang, L. J. and Zhu, C. G. and Zhang, P. C. and Yuan, L. and Peng, N. and Liu, Z. C. and Zeng, M.",
        TITLE = "An Empirical Study of Training End-to-End Vision-and-Language
Transformers",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "18145--18155",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225536"}

@inproceedings{bb230549,
        AUTHOR = "Xu, Z. P. and Lin, T. W. and Tang, H. and Li, F. and He, D. L. and Sebe, N. and Timofte, R. and Van Gool, L. J. and Ding, E.",
        TITLE = "Predict, Prevent, and Evaluate: Disentangled Text-Driven Image
Manipulation Empowered by Pre-Trained Vision-Language Model",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "18208--18217",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225537"}

@inproceedings{bb230550,
        AUTHOR = "Du, Y. and Wei, F. Y. and Zhang, Z. H. and Shi, M. J. and Gao, Y. and Li, G. Q.",
        TITLE = "Learning to Prompt for Open-Vocabulary Object Detection with
Vision-Language Model",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "14064--14073",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225538"}

@inproceedings{bb230551,
        AUTHOR = "Chang, Y. S. and Cao, G. H. and Narang, M. and Gao, J. F. and Suzuki, H. and Bisk, Y.",
        TITLE = "{WebQA}: Multihop and Multimodal {QA}",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "16474--16483",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225539"}

@inproceedings{bb230552,
        AUTHOR = "Zellers, R. and Lu, J. and Lu, X. M. and Yu, Y. and Zhao, Y. P. and Salehi, M. and Kusupati, A. and Hessel, J. and Farhadi, A. and Choi, Y.",
        TITLE = "{MERLOT RESERVE}:
Neural Script Knowledge through Vision and Language and Sound",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "16354--16366",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225540"}

@inproceedings{bb230553,
        AUTHOR = "Gupta, T. and Kamath, A. and Kembhavi, A. and Hoiem, D.",
        TITLE = "Towards General Purpose Vision Systems:
An End-to-End Task-Agnostic Vision-Language Architecture",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "16378--16388",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225541"}

@inproceedings{bb230554,
        AUTHOR = "Suris, D. and Epstein, D. and Vondrick, C.",
        TITLE = "{Globetrotter}: Connecting Languages by Connecting Images",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "16453--16463",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225542"}

@inproceedings{bb230555,
        AUTHOR = "Zhu, H. D. and Sadhu, A. and Zheng, Z. H. and Nevatia, R.",
        TITLE = "Utilizing Every Image Object for Semi-supervised Phrase Grounding",
        BOOKTITLE = WACV21,
        YEAR = "2021",
        PAGES = "2209--2218",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225543"}

@inproceedings{bb230556,
        AUTHOR = "Sung, Y. L. and Cho, J. and Bansal, M.",
        TITLE = "{VL-ADAPTER}: Parameter-Efficient Transfer Learning for
Vision-and-Language Tasks",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "5217--5227",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225544"}

@inproceedings{bb230557,
        AUTHOR = "Wu, D. M. and Dong, X. P. and Shao, L. and Shen, J. B.",
        TITLE = "Multi-Level Representation Learning with Semantic Alignment for
Referring Video Object Segmentation",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "4986--4995",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225545"}

@inproceedings{bb230558,
        AUTHOR = "Gao, K. and Chen, L. and Niu, Y. and Shao, J. and Xiao, J.",
        TITLE = "Classification-Then-Grounding: Reformulating Video Scene Graphs as
Temporal Bipartite Graphs",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "19475--19484",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225546"}

@inproceedings{bb230559,
        AUTHOR = "Kesen, I. and Can, O. A. and Erdem, E. and Erdem, A. and Yuret, D.",
        TITLE = "Modulating Bottom-Up and Top-Down Visual Processing via
Language-Conditional Filters",
        BOOKTITLE = MULA22,
        YEAR = "2022",
        PAGES = "4609--4619",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225547"}

@inproceedings{bb230560,
        AUTHOR = "Nebbia, G. and Kovashka, A.",
        TITLE = "Doubling down: sparse grounding with an additional, almost-matching
caption for detection-oriented multimodal pretraining",
        BOOKTITLE = MULA22,
        YEAR = "2022",
        PAGES = "4641--4650",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225548"}

@inproceedings{bb230561,
        AUTHOR = "Ye, J. and Tian, J. F. and Yan, M. and Yang, X. S. and Wang, X. and Zhang, J. and He, L. and Lin, X.",
        TITLE = "Shifting More Attention to Visual Backbone: Query-modulated
Refinement Networks for End-to-End Visual Grounding",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "15481--15491",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225549"}

@inproceedings{bb230562,
        AUTHOR = "Jiang, H. J. and Lin, Y. Z. and Han, D. C. and Song, S. and Huang, G.",
        TITLE = "{Pseudo-Q}: Generating Pseudo Language Queries for Visual Grounding",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "15492--15502",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225550"}

@inproceedings{bb230563,
        AUTHOR = "Huang, S. and Chen, Y. L. and Jia, J. Y. and Wang, L. W.",
        TITLE = "Multi-View Transformer for {3D} Visual Grounding",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "15503--15512",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225551"}

@inproceedings{bb230564,
        AUTHOR = "Chen, S. and Li, B.",
        TITLE = "Multi-Modal Dynamic Graph Transformer for Visual Grounding",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "15513--15522",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225552"}

@inproceedings{bb230565,
        AUTHOR = "Mavroudi, E. and Vidal, R.",
        TITLE = "Weakly-Supervised Generation and Grounding of Visual Descriptions
with Conditional Generative Models",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "15523--15533",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225553"}

@inproceedings{bb230566,
        AUTHOR = "Chen, S. and Zhao, Q.",
        TITLE = "{REX}: Reasoning-aware and Grounded Explanation",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "15565--15574",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225554"}

@inproceedings{bb230567,
        AUTHOR = "Lou, C. and Han, W. J. and Lin, Y. and Zheng, Z. L.",
        TITLE = "Unsupervised Vision-Language Parsing: Seamlessly Bridging Visual
Scene Graphs with Language Structures via Dependency Relationships",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "15586--15595",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225555"}

@inproceedings{bb230568,
        AUTHOR = "Luo, J. Y. and Fu, J. and Kong, X. and Gao, C. and Ren, H. B. and Shen, H. and Xia, H. X. and Liu, S.",
        TITLE = "{3D-SPS}: Single-Stage {3D} Visual Grounding via Referred Point
Progressive Selection",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "16433--16442",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225556"}

@inproceedings{bb230569,
        AUTHOR = "Cai, D. and Zhao, L. C. and Zhang, J. and Sheng, L. and Xu, D.",
        TITLE = "{3DJCG}: A Unified Framework for Joint Dense Captioning and Visual
Grounding on {3D} Point Clouds",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "16443--16452",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225557"}

@inproceedings{bb230570,
        AUTHOR = "Luo, H. C. and Zhai, W. and Zhang, J. and Cao, Y. and Tao, D. C.",
        TITLE = "Learning Affordance Grounding from Exocentric Images",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "2242--2251",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225558"}

@inproceedings{bb230571,
        AUTHOR = "Jiang, X. and Xu, X. and Zhang, J. and Shen, F. M. and Cao, Z. and Shen, H. T.",
        TITLE = "Semi-supervised Video Paragraph Grounding with Contrastive Encoder",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "2456--2465",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225559"}

@inproceedings{bb230572,
        AUTHOR = "Yu, W. and Chen, W. X. and Yin, S. and Easterbrook, S. and Garg, A.",
        TITLE = "Modular Action Concept Grounding in Semantic Video Prediction",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "3595--3604",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225560"}

@inproceedings{bb230573,
        AUTHOR = "Soldan, M. and Pardo, A. and Alcazar, J. L. and Heilbron, F. C. and Zhao, C. and Giancola, S. and Ghanem, B.",
        TITLE = "{MAD}: A Scalable Dataset for Language Grounding in Videos from Movie
Audio Descriptions",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "5016--5025",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225561"}

@inproceedings{bb230574,
        AUTHOR = "Yang, L. and Xu, Y. and Yuan, C. F. and Liu, W. and Li, B. and Hu, W. M.",
        TITLE = "Improving Visual Grounding with Visual-Linguistic Verification and
Iterative Reasoning",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "9489--9498",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225562"}

@inproceedings{bb230575,
        AUTHOR = "Li, L. H. and Zhang, P. C. and Zhang, H. T. and Yang, J. W. and Li, C. Y. and Zhong, Y. and Wang, L. J. and Yuan, L. and Zhang, L. and Hwang, J. N. and Chang, K. W. and Gao, J. F.",
        TITLE = "Grounded Language-Image Pre-training",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "10955--10965",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225563"}

@inproceedings{bb230576,
        AUTHOR = "Li, Y. C. and Wang, X. and Xiao, J. B. and Ji, W. and Chua, T. S.",
        TITLE = "Invariant Grounding for Video Question Answering",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "2918--2927",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225564"}

@inproceedings{bb230577,
        AUTHOR = "Yang, Z. Y. and Zhang, S. Y. and Wang, L. W. and Luo, J. B.",
        TITLE = "{SAT}: {2D} Semantics Assisted Training for {3D} Visual Grounding",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "1836--1846",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225565"}

% NOTE(review): the second author was listed as "Golisano, Y.K.", which looks
% like an affiliation fused into the name; corrected to "Kong, Y." -- confirm
% against the ICCV 2021 proceedings.
@inproceedings{bb230578,
        AUTHOR = "Chen, J. W. and Kong, Y.",
        TITLE = "Explainable Video Entailment with Grounded Visual Evidence",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "2001--2010",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225566"}

@inproceedings{bb230579,
        AUTHOR = "Zhao, L. C. and Cai, D. and Sheng, L. and Xu, D.",
        TITLE = "{3DVG-Transformer}: Relation Modeling for Visual Grounding on Point
Clouds",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "2908--2917",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225567"}

@inproceedings{bb230580,
        AUTHOR = "Feng, M. and Li, Z. and Li, Q. and Zhang, L. and Zhang, X. and Zhu, G. M. and Zhang, H. and Wang, Y. and Mian, A.",
        TITLE = "Free-form Description Guided {3D} Visual Graph Network for Object
Grounding in Point Cloud",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "3702--3711",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225568"}

@inproceedings{bb230581,
        AUTHOR = "Ding, X. P. and Wang, N. N. and Zhang, S. W. and Cheng, D. and Li, X. M. and Huang, Z. Y. and Tang, M. Q. and Gao, X. B.",
        TITLE = "Support-Set Based Cross-Supervision for Video Grounding",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "11553--11562",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225569"}

@inproceedings{bb230582,
        AUTHOR = "Khandelwal, S. and Suhail, M. and Sigal, L.",
        TITLE = "Segmentation-grounded Scene Graph Generation",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "15859--15869",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225570"}

@comment{bb230583: PAGES start corrected from 15993 to 15933; the original range ended before it began. Presumably a digit typo; verify against the ICCV 2021 proceedings pagination.}
@inproceedings{bb230583,
        AUTHOR = "Patel, S. and Wani, S. and Jain, U. and Schwing, A. and Lazebnik, S. and Savva, M. and Chang, A.X.",
        TITLE = "Interpretation of Emergent Communication in Heterogeneous
Collaborative Embodied Agents",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "15933-15943",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225571"}

@inproceedings{bb230584,
        AUTHOR    = {Shi, J. and Zhong, Y. and Xu, N. and Li, Y. and Xu, C.L.},
        TITLE     = {A Simple Baseline for Weakly-Supervised Scene Graph Generation},
        BOOKTITLE = ICCV21,
        YEAR      = {2021},
        PAGES     = {16373-16382},
        BIBSOURCE = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225572}
}

@inproceedings{bb230585,
        AUTHOR    = {Su, R. and Yu, Q. and Xu, D.},
        TITLE     = {STVGBert: A Visual-linguistic Transformer based Framework for Spatio-temporal Video Grounding},
        BOOKTITLE = ICCV21,
        YEAR      = {2021},
        PAGES     = {1513-1522},
        BIBSOURCE = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225573}
}

@inproceedings{bb230586,
        AUTHOR    = {Cui, C.Y.Q. and Khandelwal, A. and Artzi, Y. and Snavely, N. and Averbuch Elor, H.},
        TITLE     = {Who's Waldo? Linking People Across Text and Images},
        BOOKTITLE = ICCV21,
        YEAR      = {2021},
        PAGES     = {1354-1364},
        BIBSOURCE = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225574}
}

@inproceedings{bb230587,
        AUTHOR    = {Gonzalez, C. and Ayobi, N. and Hernandez, I. and Hernandez, J. and Pont Tuset, J. and Arbelaez, P.},
        TITLE     = {Panoptic Narrative Grounding},
        BOOKTITLE = ICCV21,
        YEAR      = {2021},
        PAGES     = {1344-1353},
        BIBSOURCE = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225575}
}

@inproceedings{bb230588,
        AUTHOR    = {Hong, Y. and Li, Q. and Zhu, S.C. and Huang, S.Y.},
        TITLE     = {VLGrammar: Grounded Grammar Induction of Vision and Language},
        BOOKTITLE = ICCV21,
        YEAR      = {2021},
        PAGES     = {1645-1654},
        BIBSOURCE = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225576}
}

@inproceedings{bb230589,
        AUTHOR    = {Yuan, Z.H. and Yan, X. and Liao, Y.H. and Zhang, R.M. and Wang, S. and Li, Z. and Cui, S.G.},
        TITLE     = {InstanceRefer: Cooperative Holistic Understanding for Visual Grounding on Point Clouds through Instance Multi-level Contextual Referring},
        BOOKTITLE = ICCV21,
        YEAR      = {2021},
        PAGES     = {1771-1780},
        BIBSOURCE = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225577}
}

@inproceedings{bb230590,
        AUTHOR    = {Soldan, M. and Xu, M.M. and Qu, S. and Tegner, J. and Ghanem, B.},
        TITLE     = {VLG-Net: Video-Language Graph Matching Network for Video Grounding},
        BOOKTITLE = CVEU21,
        YEAR      = {2021},
        PAGES     = {3217-3227},
        BIBSOURCE = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225578}
}

@inproceedings{bb230591,
        AUTHOR    = {Lu, X.P. and Fan, Z. and Wang, Y. and Oh, J. and Rose, C.P.},
        TITLE     = {Localize, Group, and Select: Boosting Text-VQA by Scene Text Modeling},
        BOOKTITLE = XSAnim21,
        YEAR      = {2021},
        PAGES     = {2631-2639},
        BIBSOURCE = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225579}
}

@inproceedings{bb230592,
        AUTHOR    = {Tian, Y.P. and Hu, D. and Xu, C.L.},
        TITLE     = {Cyclic Co-Learning of Sounding Object Visual Grounding and Sound Separation},
        BOOKTITLE = CVPR21,
        YEAR      = {2021},
        PAGES     = {2744-2753},
        BIBSOURCE = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225580}
}

@inproceedings{bb230593,
        AUTHOR    = {Nan, G.S. and Qiao, R. and Xiao, Y. and Liu, J. and Leng, S.C. and Zhang, H. and Lu, W.},
        TITLE     = {Interventional Video Grounding with Dual Contrastive Learning},
        BOOKTITLE = CVPR21,
        YEAR      = {2021},
        PAGES     = {2764-2774},
        BIBSOURCE = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225581}
}

@inproceedings{bb230594,
        AUTHOR    = {Zhao, Y. and Zhao, Z. and Zhang, Z. and Lin, Z.J.},
        TITLE     = {Cascaded Prediction Network via Segment Tree for Temporal Video Grounding},
        BOOKTITLE = CVPR21,
        YEAR      = {2021},
        PAGES     = {4195-4204},
        BIBSOURCE = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225582}
}

@inproceedings{bb230595,
        AUTHOR    = {Liu, Y.F. and Wan, B. and Ma, L. and He, X.M.},
        TITLE     = {Relation-aware Instance Refinement for Weakly Supervised Visual Grounding},
        BOOKTITLE = CVPR21,
        YEAR      = {2021},
        PAGES     = {5608-5617},
        BIBSOURCE = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225583}
}

@inproceedings{bb230596,
        AUTHOR    = {Liu, H.L. and Lin, A. and Han, X.G. and Yang, L. and Yu, Y.Z. and Cui, S.G.},
        TITLE     = {Refer-it-in-RGBD: A Bottom-up Approach for 3D Visual Grounding in RGBD Images},
        BOOKTITLE = CVPR21,
        YEAR      = {2021},
        PAGES     = {6028-6037},
        BIBSOURCE = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225584}
}

@inproceedings{bb230597,
        AUTHOR    = {Lin, X.R. and Li, G.B. and Yu, Y.Z.},
        TITLE     = {Scene-Intuitive Agent for Remote Embodied Visual Grounding},
        BOOKTITLE = CVPR21,
        YEAR      = {2021},
        PAGES     = {7032-7041},
        BIBSOURCE = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225585}
}

@inproceedings{bb230598,
        AUTHOR    = {Liu, D.Z. and Qu, X.Y. and Dong, J.F. and Zhou, P. and Cheng, Y. and Wei, W. and Xu, Z. and Xie, Y.},
        TITLE     = {Context-aware Biaffine Localizing Network for Temporal Sentence Grounding},
        BOOKTITLE = CVPR21,
        YEAR      = {2021},
        PAGES     = {11230-11239},
        BIBSOURCE = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225586}
}

@inproceedings{bb230599,
        AUTHOR    = {Meng, Z.H. and Yu, L.C. and Zhang, N. and Berg, T. and Damavandi, B. and Singh, V. and Bearman, A.},
        TITLE     = {Connecting What to Say With Where to Look by Modeling Human Attention Traces},
        BOOKTITLE = CVPR21,
        YEAR      = {2021},
        PAGES     = {12674-12683},
        BIBSOURCE = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT225587}
}

Last update: May 14, 2025 at 16:05:19