@inproceedings{bb213200,
  author    = {Liu, F. and Xiang, T. and Hospedales, T. M. and Yang, W. K. and Sun, C. Y.},
  title     = {{iVQA}: Inverse Visual Question Answering},
  booktitle = CVPR18,
  year      = {2018},
  pages     = {8611--8619},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208278}
}

% NOTE(review): the source record carried the placeholder page range xx-yy,
% which would print verbatim; it is left out until the real pages are confirmed.
@article{bb213201,
  author    = {Patil, C. and Patwardhan, M.},
  title     = {Visual Question Generation: The State of the Art},
  journal   = Surveys,
  volume    = {53},
  number    = {3},
  month     = may,
  year      = {2020},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208279}
}

@article{bb213202,
  author    = {He, F. J. and Wang, Y. X. and Miao, X. L. and Sun, X.},
  title     = {Interpretable visual reasoning: A survey},
  journal   = IVC,
  volume    = {112},
  year      = {2021},
  pages     = {104194},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208280}
}

@article{bb213203,
  author    = {Sharma, H. and Jalal, A. S.},
  title     = {A survey of methods, datasets and evaluation metrics for visual question answering},
  journal   = IVC,
  volume    = {116},
  year      = {2021},
  pages     = {104327},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208281}
}

@article{bb213204,
  author    = {Yang, L. and Jiang, H. and Song, Q. and Guo, J.},
  title     = {A Survey on Long-Tailed Visual Recognition},
  journal   = IJCV,
  volume    = {130},
  number    = {7},
  pages     = {1837--1872},
  month     = jul,
  year      = {2022},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208282}
}

@article{bb213205,
  author    = {Zhao, W. L. and Rao, Y. M. and Tang, Y. S. and Zhou, J. and Lu, J. W.},
  title     = {{VideoABC}: A Real-World Video Dataset for Abductive Visual Reasoning},
  journal   = IP,
  volume    = {31},
  year      = {2022},
  pages     = {6048--6061},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208283}
}

@article{bb213206,
  author    = {Lahouti, F. and Kostina, V. and Hassibi, B.},
  title     = {How to Query an Oracle? Efficient Strategies to Label Data},
  journal   = PAMI,
  volume    = {44},
  number    = {11},
  pages     = {7597--7609},
  month     = nov,
  year      = {2022},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208284}
}

@inproceedings{bb213207,
  author    = {Singh, M. and Patvardhan, C. and Lakshmi, C. V.},
  title     = {Does {ChatGPT} Spell the End of Automatic Question Generation Research?},
  booktitle = ICCVMI23,
  year      = {2023},
  pages     = {1--6},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208285}
}

@inproceedings{bb213208,
  author    = {Zhu, L. and Ning, R. and Li, J. and Xin, C. S. and Wu, H. Y.},
  title     = {Most and Least Retrievable Images in Visual-Language Query Systems},
  booktitle = ECCV22,
  year      = {2022},
  pages     = {XXXVII:1--18},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208286}
}

@inproceedings{bb213209,
  author    = {Salewski, L. and Emde, C. and Do, V. and Akata, Z. and Lukasiewicz, T.},
  title     = {{e-ViL}: A Dataset and Benchmark for Natural Language Explanations in Vision-Language Tasks},
  booktitle = ICCV21,
  year      = {2021},
  pages     = {1224--1234},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208287}
}

@inproceedings{bb213210,
  author    = {Gupta, V. and Patro, B. N. and Parihar, H. and Namboodiri, V. P.},
  title     = {{VQuAD}: Video Question Answering Diagnostic Dataset},
  booktitle = Novelty22,
  year      = {2022},
  pages     = {282--291},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208288}
}

@inproceedings{bb213211,
  author    = {Nishimura, T. and Sakoda, K. and Hashimoto, A. and Ushiku, Y. and Tanaka, N. and Ono, F. and Kameko, H. and Mori, S.},
  title     = {Egocentric Biochemical Video-and-Language Dataset},
  booktitle = CLVL21,
  year      = {2021},
  pages     = {3122--3126},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208289}
}

@inproceedings{bb213212,
  author    = {Zhang, M. and Maidment, T. and Diab, A. and Kovashka, A. and Hwa, R.},
  title     = {Domain-robust {VQA} with diverse datasets and methods but no target labels},
  booktitle = CVPR21,
  year      = {2021},
  pages     = {7042--7052},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208290}
}

@inproceedings{bb213213,
  author    = {Mathew, M. and Karatzas, D. and Jawahar, C. V.},
  title     = {{DocVQA}: A Dataset for {VQA} on Document Images},
  booktitle = WACV21,
  year      = {2021},
  pages     = {2199--2208},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208291}
}

@inproceedings{bb213214,
  author    = {Patel, D. and Parikh, R. and Shastri, Y.},
  title     = {Recent Advances in Video Question Answering: A Review of Datasets and Methods},
  booktitle = VTIUR20,
  year      = {2020},
  pages     = {339--356},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208292}
}

@inproceedings{bb213215,
  author    = {Fan, C.},
  title     = {{EgoVQA}: An Egocentric Video Question Answering Benchmark Dataset},
  booktitle = EPIC19,
  year      = {2019},
  pages     = {4359--4366},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208293}
}

@inproceedings{bb213216,
  author    = {Hudson, D. A. and Manning, C. D.},
  title     = {{GQA}: A New Dataset for Real-World Visual Reasoning and Compositional Question Answering},
  booktitle = CVPR19,
  year      = {2019},
  pages     = {6693--6702},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208294}
}

@inproceedings{bb213217,
  author    = {Yang, G. Y. R. and Ganichev, I. and Wang, X. J. and Shlens, J. and Sussillo, D.},
  title     = {A Dataset and Architecture for Visual Reasoning with a Working Memory},
  booktitle = ECCV18,
  year      = {2018},
  pages     = {X:729--745},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208295}
}

@inproceedings{bb213218,
  author    = {Gan, C. and Li, Y. and Li, H. and Sun, C. and Gong, B.},
  title     = {{VQS}: Linking Segmentations to Questions and Answers for Supervised Attention in {VQA} and Question-Focused Semantic Segmentation},
  booktitle = ICCV17,
  year      = {2017},
  pages     = {1829--1838},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208296}
}

@inproceedings{bb213219,
  author    = {Maharaj, T. and Ballas, N. and Rohrbach, A. and Courville, A. and Pal, C.},
  title     = {A Dataset and Exploration of Models for Understanding Video Data through Fill-in-the-Blank Question-Answering},
  booktitle = CVPR17,
  year      = {2017},
  pages     = {7359--7368},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208297}
}

@article{bb213220,
  author    = {Das, A. and Kottur, S. and Gupta, K. and Singh, A. and Yadav, D. and Lee, S. and Moura, J. M. F. and Parikh, D. and Batra, D.},
  title     = {Visual Dialog},
  journal   = PAMI,
  volume    = {41},
  number    = {5},
  pages     = {1242--1256},
  month     = may,
  year      = {2019},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208298}
}

@article{bb213221,
  author    = {Zhao, Z. and Zhang, Z. and Jiang, X. H. and Cai, D.},
  title     = {Multi-Turn Video Question Answering via Hierarchical Attention Context Reinforced Networks},
  journal   = IP,
  volume    = {28},
  number    = {8},
  pages     = {3860--3872},
  month     = aug,
  year      = {2019},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208299}
}

@article{bb213222,
  author    = {Gu, M. and Zhao, Z. and Jin, W. and Cai, D. and Wu, F.},
  title     = {Video Dialog via Multi-Grained Convolutional Self-Attention Context Multi-Modal Networks},
  journal   = CirSysVideo,
  volume    = {30},
  number    = {12},
  pages     = {4453--4466},
  month     = dec,
  year      = {2020},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208300}
}

@article{bb213223,
  author    = {Guo, D. and Wang, H. and Wang, S. and Wang, M.},
  title     = {Textual-Visual Reference-Aware Attention Network for Visual Dialog},
  journal   = IP,
  volume    = {29},
  year      = {2020},
  pages     = {6655--6666},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208301}
}

@article{bb213224,
  author    = {Patro, B. N. and Anupriy and Namboodiri, V. P.},
  title     = {Probabilistic framework for solving visual dialog},
  journal   = PR,
  volume    = {110},
  year      = {2021},
  pages     = {107586},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208302}
}

@article{bb213225,
  author    = {Zhao, L. and Lyu, X. Y. and Song, J. K. and Gao, L. L.},
  title     = {{GuessWhich}? Visual dialog with attentive memory network},
  journal   = PR,
  volume    = {114},
  year      = {2021},
  pages     = {107823},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208303}
}

@article{bb213226,
  author    = {Jiang, T. L. and Shao, H. L. and Tian, X. and Ji, Y. and Liu, C. P.},
  title     = {Aligning vision-language for graph inference in visual dialog},
  journal   = IVC,
  volume    = {116},
  year      = {2021},
  pages     = {104316},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208304}
}

@article{bb213227,
  author    = {Guo, D. and Wang, H. and Wang, M.},
  title     = {Context-Aware Graph Inference With Knowledge Distillation for Visual Dialog},
  journal   = PAMI,
  volume    = {44},
  number    = {10},
  pages     = {6056--6073},
  month     = oct,
  year      = {2022},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208305}
}

@inproceedings{bb213228,
  author    = {Guo, D. and Wang, H. and Zhang, H. W. and Zha, Z. J. and Wang, M.},
  title     = {Iterative Context-Aware Graph Inference for Visual Dialog},
  booktitle = CVPR20,
  year      = {2020},
  pages     = {10052--10061},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208306}
}

@article{bb213229,
  author    = {Patro, B. N. and Anupriy and Namboodiri, V. P.},
  title     = {Explanation vs. attention: A two-player game to obtain attention for {VQA} and visual dialog},
  journal   = PR,
  volume    = {132},
  year      = {2022},
  pages     = {108898},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208307}
}

@article{bb213230,
  author    = {Zhu, Y. and Wu, Y. and Yang, Y. and Yan, Y.},
  title     = {Saying the Unseen: Video Descriptions via Dialog Agents},
  journal   = PAMI,
  volume    = {44},
  number    = {10},
  pages     = {7190--7204},
  month     = oct,
  year      = {2022},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208308}
}

@article{bb213231,
  author    = {Huang, Y. and Wang, Y. M. and Wang, L.},
  title     = {Efficient Image and Sentence Matching},
  journal   = PAMI,
  volume    = {45},
  number    = {3},
  pages     = {2970--2983},
  month     = mar,
  year      = {2023},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208309}
}

@article{bb213232,
  author    = {Zhao, L. and Li, J. L. and Gao, L. L. and Rao, Y. and Song, J. K. and Shen, H. T.},
  title     = {Heterogeneous Knowledge Network for Visual Dialog},
  journal   = CirSysVideo,
  volume    = {33},
  number    = {2},
  pages     = {861--871},
  month     = feb,
  year      = {2023},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208310}
}

@article{bb213233,
  author    = {Bucinca, Z. and Yemez, Y. and Erzin, E. and Sezgin, M.},
  title     = {{AffectON}: Incorporating Affect Into Dialog Generation},
  journal   = AffCom,
  volume    = {14},
  number    = {1},
  pages     = {823--835},
  month     = jan,
  year      = {2023},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208311}
}

@article{bb213234,
  author    = {Yu, H. and Ko, Y. J.},
  title     = {Enriching the dialogue state tracking model with a syntactic discourse graph},
  journal   = PRL,
  volume    = {169},
  year      = {2023},
  pages     = {81--86},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208312}
}

@article{bb213235,
  author    = {Wu, Y. X. and Liao, L. and Zhang, G. Y. and Lei, W. Q. and Zhao, G. S. and Qian, X. M. and Chua, T. S.},
  title     = {State Graph Reasoning for Multimodal Conversational Recommendation},
  journal   = MultMed,
  volume    = {25},
  year      = {2023},
  pages     = {3113--3124},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208313}
}

@article{bb213236,
  author    = {Firdaus, M. and Thangavelu, N. and Ekbal, A. and Bhattacharyya, P.},
  title     = {I Enjoy Writing and Playing, Do You?: A Personalized and Emotion Grounded Dialogue Agent Using Generative Adversarial Network},
  journal   = AffCom,
  volume    = {14},
  number    = {3},
  pages     = {2127--2138},
  month     = jul,
  year      = {2023},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208314}
}

@article{bb213237,
  author    = {Zhang, Z. and Li, S. and Ji, Y. and Liu, C. P.},
  title     = {Infer unseen from seen: Relation regularized zero-shot visual dialog},
  journal   = JVCIR,
  volume    = {97},
  year      = {2023},
  pages     = {103961},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208315}
}

@article{bb213238,
  author    = {Qi, Q. S. and Zhang, A. and Liao, Y. and Sun, W. Y. and Wang, Y. L. and Li, X. B. and Liu, S.},
  title     = {Simultaneously Training and Compressing Vision-and-Language Pre-Training Model},
  journal   = MultMed,
  volume    = {25},
  year      = {2023},
  pages     = {8194--8203},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208316}
}

@article{bb213239,
  author    = {Liu, A. A. and Huang, C. X. and Xu, N. and Tian, H. and Liu, J. and Zhang, Y. D.},
  title     = {Counterfactual Visual Dialog: Robust Commonsense Knowledge Learning From Unbiased Training},
  journal   = MultMed,
  volume    = {26},
  year      = {2024},
  pages     = {1639--1651},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208317}
}

@article{bb213240,
  author    = {Ricci, R. and Bazi, Y. and Melgani, F.},
  title     = {Machine-to-Machine Visual Dialoguing with {ChatGPT} for Enriched Textual Image Description},
  journal   = RS,
  volume    = {16},
  number    = {3},
  pages     = {441},
  year      = {2024},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208318}
}

@article{bb213241,
  author    = {Bulat, A. and Tzimiropoulos, G.},
  title     = {Language-Aware Soft Prompting: Text-to-Text Optimization for Few- and Zero-Shot Adaptation of {V\&L} Models},
  journal   = IJCV,
  volume    = {132},
  number    = {4},
  pages     = {1108--1125},
  month     = apr,
  year      = {2024},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208319}
}

@inproceedings{bb213242,
  author    = {Bulat, A. and Tzimiropoulos, G.},
  title     = {{LASP}: Text-to-Text Optimization for Language-Aware Soft Prompting of Vision and Language Models},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {23232--23241},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208320}
}

@article{bb213243,
  author    = {Wang, A. J. P. and Zhou, P. and Shou, M. Z. and Yan, S. C.},
  title     = {Enhancing Visual Grounding in Vision-Language Pre-Training With Position-Guided Text Prompts},
  journal   = PAMI,
  volume    = {46},
  number    = {5},
  pages     = {3406--3421},
  month     = may,
  year      = {2024},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208321}
}

@inproceedings{bb213244,
  author    = {Wang, A. J. P. and Zhou, P. and Shou, M. Z. and Yan, S. C.},
  title     = {Position-Guided Text Prompt for Vision-Language Pre-Training},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {23242--23251},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208322}
}

@inproceedings{bb213245,
  author    = {Han, S. J. and Hessel, J. and Dziri, N. and Choi, Y. and Yu, Y. J.},
  title     = {Champagne: Learning Real-world Conversation from Large-Scale Web Videos},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {15452--15463},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208323}
}

@inproceedings{bb213246,
  author    = {Oshima, R. and Shinagawa, S. and Tsunashima, H. and Feng, Q. and Morishima, S.},
  title     = {Pointing out Human Answer Mistakes in a Goal-Oriented Visual Dialogue},
  booktitle = VLAR23,
  year      = {2023},
  pages     = {4665--4670},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208324}
}

@inproceedings{bb213247,
  author    = {Ishii, T. and Miura, J. and Hayashi, K.},
  title     = {Enhancing Human-Robot Collaborative Object Search through Human Behavior Observation and Dialog},
  booktitle = ACVR23,
  year      = {2023},
  pages     = {1841--1848},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208325}
}

@inproceedings{bb213248,
  author    = {Madasu, A. and Lal, V.},
  title     = {Is Multimodal Vision Supervision Beneficial to Language?},
  booktitle = NFVLR23,
  year      = {2023},
  pages     = {2637--2642},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208326}
}

@inproceedings{bb213249,
  author    = {Ashutosh, K. and Girdhar, R. and Torresani, L. and Grauman, K.},
  title     = {{HierVL}: Learning Hierarchical Video-Language Embeddings},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {23066--23078},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208327}
}

@inproceedings{bb213250,
  author    = {Smith, J. S. and Cascante Bonilla, P. and Arbelle, A. and Kim, D. H. and Panda, R. and Cox, D. and Yang, D. and Kira, Z. and Feris, R. S. and Karlinsky, L.},
  title     = {{ConStruct-VL}: Data-Free Continual Structured {VL} Concepts Learning},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {14994--15004},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208328}
}

@inproceedings{bb213251,
  author    = {Chen, Y. X. and Ma, Z. Y. and Zhang, Z. Q. and Qi, Z. A. and Yuan, C. F. and Shan, Y. and Li, B. and Hu, W. M. and Qie, X. and Wu, J. P.},
  title     = {{ViLEM}: Visual-Language Error Modeling for Image-Text Retrieval},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {11018--11027},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208329}
}

@inproceedings{bb213252,
  author    = {Huang, J. J. and Li, Y. and Feng, J. S. and Wu, X. L. and Sun, X. S. and Ji, R. R.},
  title     = {Clover: Towards A Unified Video-Language Alignment and Fusion Model},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {14856--14866},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208330}
}

@inproceedings{bb213253,
  author    = {Li, C. H. and Li, Z. and Jing, C. C. and Jia, Y. D. and Wu, Y. W.},
  title     = {Exploring the Effect of Primitives for Compositional Generalization in Vision-and-Language},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {19092--19101},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208331}
}

@inproceedings{bb213254,
  author    = {Yao, H. T. and Zhang, R. and Xu, C. S.},
  title     = {Visual-Language Prompt Tuning with Knowledge-Guided Context Optimization},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {6757--6767},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208332}
}

@inproceedings{bb213255,
  author    = {Kwon, H. and Song, T. and Jeong, S. and Kim, J. and Jang, J. and Sohn, K. H.},
  title     = {Probabilistic Prompt Learning for Dense Prediction},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {6768--6777},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208333}
}

@inproceedings{bb213256,
  author    = {Luo, H. C. and Zhai, W. and Zhang, J. and Cao, Y. and Tao, D. C.},
  title     = {Leverage Interactive Affinity for Affordance Learning},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {6809--6819},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208334}
}

@inproceedings{bb213257,
  author    = {Bagad, P. and Tapaswi, M. and Snoek, C. G. M.},
  title     = {Test of Time: Instilling Video-Language Models with a Sense of Time},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {2503--2516},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208335}
}

@inproceedings{bb213258,
  author    = {Kang, G. C. and Kim, S. and Kim, J. H. and Kwak, D. H. and Zhang, B. T.},
  title     = {The Dialog Must Go On: Improving Visual Dialog via Generative Self-Training},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {6746--6756},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208336}
}

@inproceedings{bb213259,
  author    = {Bannur, S. and Hyland, S. and Liu, Q. and Perez Garcia, F. and Ilse, M. and Castro, D. C. and Boecking, B. and Sharma, H. and Bouzid, K. and Thieme, A. and Schwaighofer, A. and Wetscherek, M. and Lungren, M. P. and Nori, A. and Alvarez Valle, J. and Oktay, O.},
  title     = {Learning to Exploit Temporal Structure for Biomedical Vision-Language Processing},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {15016--15027},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208337}
}

@inproceedings{bb213260,
  author    = {Srinivasan, T. and Ren, X. and Thomason, J.},
  title     = {Curriculum Learning for Data-Efficient Vision-Language Alignment},
  booktitle = ODRUM23,
  year      = {2023},
  pages     = {5619--5624},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208338}
}

@inproceedings{bb213261,
  author    = {Ibing, M. and Lim, I. and Kobbelt, L.},
  title     = {Localized Latent Updates for Fine-Tuning Vision-Language Models},
  booktitle = ECV23,
  year      = {2023},
  pages     = {4509--4518},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208339}
}

@inproceedings{bb213262,
  author    = {Zhou, Y. T. and Shimada, N.},
  title     = {Vision + Language Applications: A Survey},
  booktitle = GCV23,
  year      = {2023},
  pages     = {826--842},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208340}
}

@inproceedings{bb213263,
  author    = {Parisot, S. and Yang, Y. X. and McDonagh, S.},
  title     = {Learning to Name Classes for Vision and Language Models},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {23477--23486},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208341}
}

@inproceedings{bb213264,
  author    = {Kim, S. and Jo, D. and Lee, D. and Kim, J.},
  title     = {{MAGVLT}: Masked Generative Vision-and-Language Transformer},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {23338--23348},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208342}
}

@inproceedings{bb213265,
  author    = {Ji, Y. and Wang, J. J. and Gong, Y. and Zhang, L. and Zhu, Y. and Wang, H. F. and Zhang, J. X. and Sakai, T. and Yang, Y.},
  title     = {{MAP}: Multimodal Uncertainty-Aware Vision-Language Pre-training Model},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {23262--23271},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208343}
}

@inproceedings{bb213266,
  author    = {Zhang, X. and Wang, W. and Chen, Z. and Xu, Y. F. and Zhang, J. and Tao, D. C.},
  title     = {{CLAMP}: Prompt-based Contrastive Learning for Connecting Language and Animal Pose},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {23272--23281},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208344}
}

@inproceedings{bb213267,
  author    = {Wang, T. and Ge, Y. X. and Zheng, F. and Cheng, R. and Shan, Y. and Qie, X. and Luo, P.},
  title     = {Accelerating Vision-Language Pretraining with Free Language Modeling},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {23161--23170},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208345}
}

@inproceedings{bb213268,
  author    = {Doveh, S. and Arbelle, A. and Harary, S. and Schwartz, E. and Herzig, R. and Giryes, R. and Feris, R. S. and Panda, R. and Ullman, S. and Karlinsky, L.},
  title     = {Teaching Structured Vision and Language Concepts to Vision and Language Models},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {2657--2668},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208346}
}

@inproceedings{bb213269,
  author    = {Chino, A. and Teraoka, T.},
  title     = {Relevance-aware Question Generation in Non-task-oriented Dialogue Systems},
  booktitle = VAMR23,
  year      = {2023},
  pages     = {344--358},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208347}
}

@inproceedings{bb213270,
  author    = {Tang, Z. and Cho, J. and Lei, J. and Bansal, M.},
  title     = {{PERCEIVER-VL}: Efficient Vision-and-Language Modeling with Iterative Latent Attention},
  booktitle = WACV23,
  year      = {2023},
  pages     = {4399--4409},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208348}
}

@inproceedings{bb213271,
  author    = {Tripathi, A. and Mishra, A. and Chakraborty, A.},
  title     = {Grounding Scene Graphs on Natural Images via Visio-Lingual Message Passing},
  booktitle = WACV23,
  year      = {2023},
  pages     = {4380--4389},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208349}
}

@inproceedings{bb213272,
  author    = {Byun, J. and Hwang, T. and Fu, J. L. and Moon, T.},
  title     = {{GRIT-VLP}: Grouped Mini-batch Sampling for Efficient Vision and Language Pre-training},
  booktitle = ECCV22,
  year      = {2022},
  pages     = {XIX:395--412},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208350}
}

@inproceedings{bb213273,
  author    = {Yan, S. P. and Hong, L. and Xu, H. and Han, J. H. and Tuytelaars, T. and Li, Z. G. and He, X. M.},
  title     = {Generative Negative Text Replay for Continual Vision-Language Pretraining},
  booktitle = ECCV22,
  year      = {2022},
  pages     = {XXXVI:22--38},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208351}
}

@inproceedings{bb213274,
  author    = {Zhang, Y. F. and Jiang, M. and Zhao, Q.},
  title     = {New Datasets and Models for Contextual Reasoning in Visual Dialog},
  booktitle = ECCV22,
  year      = {2022},
  pages     = {XXXVI:434--451},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208352}
}

@inproceedings{bb213275,
  author    = {Pham, H. A. and Le, T. M. and Le, V. and Phuong, T. M. and Tran, T.},
  title     = {Video Dialog as Conversation About Objects Living in Space-Time},
  booktitle = ECCV22,
  year      = {2022},
  pages     = {XXIX:710--726},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208353}
}

@inproceedings{bb213276,
  author    = {Zhang, Z. F. and Jiang, T. L. and Liu, C. P. and Ji, Y.},
  title     = {Coupling Attention and Convolution for Heuristic Network in Visual Dialog},
  booktitle = ICIP22,
  year      = {2022},
  pages     = {2896--2900},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208354}
}

@inproceedings{bb213277,
  author    = {Zhang, H. Y. and Li, Y. M. and Zhang, Z. F.},
  title     = {Video-Grounded Dialogues with Joint Video and Image Training},
  booktitle = ICIP22,
  year      = {2022},
  pages     = {3903--3907},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208355}
}

@inproceedings{bb213278,
  author    = {Zhang, S. and Jiang, X. Z. and Yang, Z. and Wan, T. and Qin, Z. C.},
  title     = {Reasoning with Multi-Structure Commonsense Knowledge in Visual Dialog},
  booktitle = MULA22,
  year      = {2022},
  pages     = {4599--4608},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208356}
}

@inproceedings{bb213279,
  author    = {Zhu, Y. and Weng, Y. and Zhu, F. D. and Liang, X. D. and Ye, Q. X. and Lu, Y. T. and Jiao, J. B.},
  title     = {Self-Motivated Communication Agent for Real-World Vision-Dialog Navigation},
  booktitle = ICCV21,
  year      = {2021},
  pages     = {1574--1583},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208357}
}

@inproceedings{bb213280,
  author    = {Engin, D. and Schnitzler, F. and Duong, N. Q. K. and Avrithis, Y.},
  title     = {On the hidden treasure of dialog in video question answering},
  booktitle = ICCV21,
  year      = {2021},
  pages     = {2044--2053},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208358}
}

@inproceedings{bb213281,
  author    = {Matsumori, S. and Shingyouchi, K. and Abe, Y. and Fukuchi, Y. and Sugiura, K. and Imai, M.},
  title     = {Unified Questioner Transformer for Descriptive Question Generation in Goal-Oriented Visual Dialogue},
  booktitle = ICCV21,
  year      = {2021},
  pages     = {1878--1887},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208359}
}

@inproceedings{bb213282,
  author    = {Tu, T. and Ping, Q. and Thattai, G. and Tur, G. and Natarajan, P.},
  title     = {Learning Better Visual Dialog Agents with Pretrained Visual-Linguistic Representation},
  booktitle = CVPR21,
  year      = {2021},
  pages     = {5618--5627},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208360}
}

@inproceedings{bb213283,
  author    = {Jiang, T. L. and Ji, Y. and Liu, C. P.},
  title     = {Integrating Historical States and Co-attention Mechanism for Visual Dialog},
  booktitle = ICPR21,
  year      = {2021},
  pages     = {2041--2048},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208361}
}

@inproceedings{bb213284,
        AUTHOR = "Nguyen, V. Q. and Suganuma, M. and Okatani, T.",
        TITLE = "Efficient Attention Mechanism for Visual Dialog that Can Handle All the Interactions Between Multiple Inputs",
        BOOKTITLE = ECCV20,
        YEAR = "2020",
        PAGES = "XXIV:223--240",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208362"}

@inproceedings{bb213285,
        AUTHOR = "Murahari, V. and Batra, D. and Parikh, D. and Das, A.",
        TITLE = "Large-scale Pretraining for Visual Dialog: A Simple State-of-the-art Baseline",
        BOOKTITLE = ECCV20,
        YEAR = "2020",
        PAGES = "XVIII:336--352",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208363"}

@inproceedings{bb213286,
        AUTHOR = "Zhu, Y. and Wu, Y. and Yang, Y. and Yan, Y.",
        TITLE = "Describing Unseen Videos via Multi-Modal Cooperative Dialog Agents",
        BOOKTITLE = ECCV20,
        YEAR = "2020",
        PAGES = "XXIII:153--169",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208364"}

@inproceedings{bb213287,
        AUTHOR = "Qi, J. and Niu, Y. and Huang, J. and Zhang, H.",
        TITLE = "Two Causal Principles for Improving Visual Dialog",
        BOOKTITLE = CVPR20,
        YEAR = "2020",
        PAGES = "10857--10866",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208365"}

@inproceedings{bb213288,
        AUTHOR = "Abbasnejad, E. and Teney, D. and Parvaneh, A. and Shi, J. and van den Hengel, A. J.",
        TITLE = "Counterfactual Vision and Language Learning",
        BOOKTITLE = CVPR20,
        YEAR = "2020",
        PAGES = "10041--10051",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208366"}

@inproceedings{bb213289,
        AUTHOR = "Zhu, Y. and Zhu, F. and Zhan, Z. and Lin, B. and Jiao, J. and Chang, X. and Liang, X.",
        TITLE = "Vision-Dialog Navigation by Exploring Cross-Modal Memory",
        BOOKTITLE = CVPR20,
        YEAR = "2020",
        PAGES = "10727--10736",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208367"}

@inproceedings{bb213290,
        AUTHOR = "Yang, T. and Zha, Z. and Zhang, H.",
        TITLE = "Making History Matter: History-Advantage Sequence Training for Visual Dialog",
        BOOKTITLE = ICCV19,
        YEAR = "2019",
        PAGES = "2561--2569",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208368"}

@inproceedings{bb213291,
        AUTHOR = "Guo, D. and Xu, C. and Tao, D. C.",
        TITLE = "Image-Question-Answer Synergistic Network for Visual Dialog",
        BOOKTITLE = CVPR19,
        YEAR = "2019",
        PAGES = "10426--10435",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208369"}

@inproceedings{bb213292,
        AUTHOR = "Zheng, Z. L. and Wang, W. G. and Qi, S. Y. and Zhu, S. C.",
        TITLE = "Reasoning Visual Dialogs With Structural and Partial Observations",
        BOOKTITLE = CVPR19,
        YEAR = "2019",
        PAGES = "6662--6671",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208370"}

@inproceedings{bb213293,
        AUTHOR = "Bani, G. and Belli, D. and Dagan, G. and Geenen, A. and Skliar, A. and Venkatesh, A. and Baumgartner, T. and Bruni, E. and Fernandez, R.",
        TITLE = "Adding Object Detection Skills to Visual Dialogue Agents",
        BOOKTITLE = VL18,
        YEAR = "2018",
        PAGES = "IV:180--187",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208371"}

@inproceedings{bb213294,
        AUTHOR = "Yang, M. and Yang, N. S. R. and Zhang, K. and Tao, J.",
        TITLE = "Self-Talk: Responses to Users' Opinions and Challenges in Human Computer Dialog",
        BOOKTITLE = ICPR18,
        YEAR = "2018",
        PAGES = "2839--2844",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208372"}

@inproceedings{bb213295,
        AUTHOR = "Jain, U. and Schwing, A. and Lazebnik, S.",
        TITLE = "Two Can Play This Game: Visual Dialog with Discriminative Question Generation and Answering",
        BOOKTITLE = CVPR18,
        YEAR = "2018",
        PAGES = "5754--5763",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208373"}

@inproceedings{bb213296,
        AUTHOR = "Dokania, P. K. and Torr, P. H. S. and Siddharth, N. and Massiceti, D.",
        TITLE = "{FLIPDIAL}: A Generative Model for Two-Way Visual Dialogue",
        BOOKTITLE = CVPR18,
        YEAR = "2018",
        PAGES = "6097--6105",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208374"}

@inproceedings{bb213297,
        AUTHOR = "Wu, Q. and Wang, P. and Shen, C. and Reid, I. D. and van den Hengel, A. J.",
        TITLE = "Are You Talking to Me? {Reasoned} Visual Dialog Generation Through Adversarial Learning",
        BOOKTITLE = CVPR18,
        YEAR = "2018",
        PAGES = "6106--6115",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208375"}

@inproceedings{bb213298,
        AUTHOR = "Kottur, S. and Moura, J. M. F. and Parikh, D. and Batra, D. and Rohrbach, M.",
        TITLE = "Visual Coreference Resolution in Visual Dialog Using Neural Module Networks",
        BOOKTITLE = ECCV18,
        YEAR = "2018",
        PAGES = "XV:160--178",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208376"}

@inproceedings{bb213299,
        AUTHOR = "Strub, F. and Seurin, M. and Perez, E. and de Vries, H. and Mary, J. and Preux, P. and Courville, A. and Pietquin, O.",
        TITLE = "Visual Reasoning with Multi-hop Feature Modulation",
        BOOKTITLE = ECCV18,
        YEAR = "2018",
        PAGES = "VI:808--831",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT208377"}

Last update: Apr 18, 2024 at 11:38:49