@article{bb207000,
        AUTHOR = "Luo, H. N. and Lin, G. S. and Yao, Y. Z. and Liu, F. Y. and Liu, Z. C. and Tang, Z. M.",
        TITLE = "Depth and Video Segmentation Based Visual Attention for Embodied Question Answering",
        JOURNAL = PAMI,
        VOLUME = "45",
        YEAR = "2023",
        NUMBER = "6",
        MONTH = jun,
        PAGES = "6807--6819",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202110"}

@inproceedings{bb207001,
        AUTHOR = "Luo, H. N. and Lin, G. S. and Liu, Z. C. and Liu, F. Y. and Tang, Z. M. and Yao, Y. Z.",
        TITLE = "{SegEQA}: Video Segmentation Based Visual Attention for Embodied Question Answering",
        BOOKTITLE = ICCV19,
        YEAR = "2019",
        PAGES = "9666--9675",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202111"}

@article{bb207002,
        AUTHOR = "Zhang, X. and Zhang, F. F. and Xu, C. S.",
        TITLE = "Reducing Vision-Answer Biases for Multiple-Choice {VQA}",
        JOURNAL = IP,
        VOLUME = "32",
        YEAR = "2023",
        PAGES = "4621--4634",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202112"}

@article{bb207003,
        AUTHOR = "Xiao, J. B. and Zhou, P. and Yao, A. and Li, Y. C. and Hong, R. and Yan, S. C. and Chua, T. S.",
        TITLE = "Contrastive Video Question Answering via Video Graph Transformer",
        JOURNAL = PAMI,
        VOLUME = "45",
        YEAR = "2023",
        NUMBER = "11",
        MONTH = nov,
        PAGES = "13265--13280",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202113"}

@inproceedings{bb207004,
        AUTHOR = "Xiao, J. B. and Zhou, P. and Chua, T. S. and Yan, S. C.",
        TITLE = "Video Graph Transformer for Video Question Answering",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVI:39--58",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202114"}

@article{bb207005,
        AUTHOR = "Shen, W. X. and Song, J. and Zhu, X. and Li, G. and Shen, H. T.",
        TITLE = "End-to-End Pre-Training With Hierarchical Matching and Momentum Contrast for Text-Video Retrieval",
        JOURNAL = IP,
        VOLUME = "32",
        YEAR = "2023",
        PAGES = "5017--5030",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202115"}

@article{bb207006,
        AUTHOR = "Jiang, J. J. and Liu, Z. and Zheng, N. N.",
        TITLE = "{LiVLR}: A Lightweight Visual-Linguistic Reasoning Framework for Video Question Answering",
        JOURNAL = MultMed,
        VOLUME = "25",
        YEAR = "2023",
        PAGES = "5002--5013",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202116"}

@article{bb207007,
        AUTHOR = "Xu, F. F. and Zhu, Y. and Wang, C. and Cao, Y. Z. and Zhong, Z. and Li, X. M.",
        TITLE = "Spatio-Temporal Two-stage Fusion for video question answering",
        JOURNAL = CVIU,
        VOLUME = "237",
        YEAR = "2023",
        PAGES = "103821",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202117"}

@inproceedings{bb207008,
        AUTHOR = "Khan, Z. and Kumar, B. V. and Schulter, S. and Yu, X. and Fu, Y. and Chandraker, M.",
        TITLE = "Q: How to Specialize Large Vision-Language Models to Data-Scarce {VQA} Tasks? {A}: Self-Train on Unlabeled Images!",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "15005--15015",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202118"}

@inproceedings{bb207009,
        AUTHOR = "Su, H. T. and Niu, Y. and Lin, X. D. and Hsu, W. H. and Chang, S. F.",
        TITLE = "Language Models are Causal Knowledge Extractors for Zero-shot Video Question Answering",
        BOOKTITLE = L3D-IVU23,
        YEAR = "2023",
        PAGES = "4951--4960",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202119"}

@inproceedings{bb207010,
        AUTHOR = "Zang, C. Q. and Wang, H. Q. and Pei, M. T. and Liang, W.",
        TITLE = "Discovering the Real Association: Multimodal Causal Reasoning in Video Question Answering",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "19027--19036",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202120"}

@inproceedings{bb207011,
        AUTHOR = "Gao, D. F. and Zhou, L. and Ji, L. and Zhu, L. C. and Yang, Y. and Shou, M. Z.",
        TITLE = "{MIST}: Multi-modal Iterative Spatial-Temporal Transformer for Long-form Video Question Answering",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "14773--14783",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202121"}

@inproceedings{bb207012,
        AUTHOR = "Khan, A. U. and Kuehne, H. and Wu, B. and Chheu, K. and Bousselham, W. and Gan, C. and Lobo, N. and Shah, M.",
        TITLE = "Learning Situation Hyper-Graphs for Video Question Answering",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "14879--14889",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202122"}

@inproceedings{bb207013,
        AUTHOR = "Jahagirdar, S. and Mathew, M. and Karatzas, D. and Jawahar, C. V.",
        TITLE = "Watching the News: Towards {VideoQA} Models that can Read",
        BOOKTITLE = WACV23,
        YEAR = "2023",
        PAGES = "4430--4439",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202123"}

@inproceedings{bb207014,
        AUTHOR = "Zhang, M. and Hwa, R. and Kovashka, A.",
        TITLE = "How to Practice {VQA} on a Resource-limited Target Domain",
        BOOKTITLE = WACV23,
        YEAR = "2023",
        PAGES = "4440--4449",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202124"}

@inproceedings{bb207015,
        AUTHOR = "Lee, J. and Kang, W. and Kim, E. S.",
        TITLE = "Dense but Efficient {VideoQA} for Intricate Compositional Reasoning",
        BOOKTITLE = WACV23,
        YEAR = "2023",
        PAGES = "1114--1123",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202125"}

@inproceedings{bb207016,
        AUTHOR = "Shen, R. and Inoue, N. and Shinoda, K.",
        TITLE = "Text-Guided Object Detector for Multi-modal Video Question Answering",
        BOOKTITLE = WACV23,
        YEAR = "2023",
        PAGES = "1032--1042",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202126"}

@inproceedings{bb207017,
        AUTHOR = "Fang, S. and Wang, S. H. and Zhuo, J. and Han, X. Z. and Huang, Q. M.",
        TITLE = "Learning Linguistic Association Towards Efficient Text-Video Retrieval",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVI:254--270",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202127"}

@inproceedings{bb207018,
        AUTHOR = "Piergiovanni, A. J. and Morton, K. and Kuo, W. C. and Ryoo, M. S. and Angelova, A.",
        TITLE = "Video Question Answering with Iterative Video-Text Co-tokenization",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVI:76--94",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202128"}

@inproceedings{bb207019,
        AUTHOR = "Barmann, L. and Waibel, A.",
        TITLE = "Where did {I} leave my keys?: Episodic-Memory-Based Question Answering on Egocentric Videos",
        BOOKTITLE = Ego4D-EPIC22,
        YEAR = "2022",
        PAGES = "1559--1567",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202129"}

@inproceedings{bb207020,
        AUTHOR = "Li, J. T. and Niu, L. and Zhang, L. Q.",
        TITLE = "From Representation to Reasoning: Towards both Evidence and Commonsense Reasoning for Video Question-Answering",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "21241--21250",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202130"}

@inproceedings{bb207021,
        AUTHOR = "Datta, S. and Dharur, S. and Cartillier, V. and Desai, R. and Khanna, M. and Batra, D. and Parikh, D.",
        TITLE = "Episodic Memory Question Answering",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "19097--19106",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202131"}

@inproceedings{bb207022,
        AUTHOR = "Gandhi, M. and Gul, M. O. and Prakash, E. and Grunde McLaughlin, M. and Krishna, R. and Agrawala, M.",
        TITLE = "Measuring Compositional Consistency for Video Question Answering",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "5036--5045",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202132"}

@inproceedings{bb207023,
        AUTHOR = "Gorti, S. K. and Vouitsis, N. and Ma, J. W. and Golestan, K. and Volkovs, M. and Garg, A. and Yu, G.",
        TITLE = "{X-Pool}: Cross-Modal Language-Video Attention for Text-Video Retrieval",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "4996--5005",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202133"}

@inproceedings{bb207024,
        AUTHOR = "Li, J. C. and Tang, S. L. and Zhu, L. C. and Shi, H. and Huang, X. and Wu, F. and Yang, Y. and Zhuang, Y. T.",
        TITLE = "Adaptive Hierarchical Graph Reasoning with Semantic Coherence for Video-and-Language Inference",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "1847--1857",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202134"}

@inproceedings{bb207025,
        AUTHOR = "Zhang, M. X. and Yang, Y. and Chen, X. and Ji, Y. L. and Xu, X. and Li, J. J. and Shen, H. T.",
        TITLE = "Multi-stage Aggregated Transformer Network for Temporal Language Localization in Videos",
        BOOKTITLE = CVPR21,
        YEAR = "2021",
        PAGES = "12664--12673",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202135"}

@inproceedings{bb207026,
        AUTHOR = "Kim, N. and Ha, S. J. and Kang, J. W.",
        TITLE = "Video Question Answering Using Language-Guided Deep Compressed-Domain Video Feature",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "1688--1697",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202136"}

@inproceedings{bb207027,
        AUTHOR = "Liu, F. and Liu, J. and Wang, W. N. and Lu, H. Q.",
        TITLE = "{HAIR}: Hierarchical Visual-Semantic Relational Reasoning for Video Question Answering",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "1678--1687",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202137"}

@inproceedings{bb207028,
        AUTHOR = "Yang, A. and Miech, A. and Sivic, J. and Laptev, I. and Schmid, C.",
        TITLE = "Just Ask: Learning to Answer Questions from Millions of Narrated Videos",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "1666--1677",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202138"}

@inproceedings{bb207029,
        AUTHOR = "Gao, D. F. and Wang, R. P. and Bai, Z. and Chen, X. L.",
        TITLE = "{Env-QA}: A Video Question Answering Benchmark for Comprehensive Understanding of Dynamic Environments",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "1655--1665",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202139"}

@inproceedings{bb207030,
        AUTHOR = "Yun, H. and Yu, Y. and Yang, W. and Lee, K. and Kim, G.",
        TITLE = "{Pano-AVQA}: Grounded Audio-Visual Question Answering on {$360^{\circ}$} Videos",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "2011--2021",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202140"}

@inproceedings{bb207031,
        AUTHOR = "Xu, L. and Huang, H. and Liu, J.",
        TITLE = "{SUTD-TrafficQA}: A Question Answering Benchmark and an Efficient Network for Video Reasoning over Traffic Events",
        BOOKTITLE = CVPR21,
        YEAR = "2021",
        PAGES = "9873--9883",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202141"}

@inproceedings{bb207032,
        AUTHOR = "Park, J. and Lee, J. Y. and Sohn, K. H.",
        TITLE = "Bridge to Answer: Structure-aware Graph Interaction Network for Video Question Answering",
        BOOKTITLE = CVPR21,
        YEAR = "2021",
        PAGES = "15521--15530",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202142"}

@inproceedings{bb207033,
        AUTHOR = "Chen, X. W. and Liu, R. and Song, X. M. and Han, Y. H.",
        TITLE = "Locating Visual Explanations for Video Question Answering",
        BOOKTITLE = MMMod21,
        YEAR = "2021",
        PAGES = "I:290--302",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202143"}

@inproceedings{bb207034,
        AUTHOR = "Garcia, N. and Nakashima, Y.",
        TITLE = "Knowledge-based Video Question Answering with Unsupervised Scene Descriptions",
        BOOKTITLE = ECCV20,
        YEAR = "2020",
        PAGES = "XVIII:581--598",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202144"}

@inproceedings{bb207035,
        AUTHOR = "Kim, J. and Ma, M. and Pham, T. and Kim, K. and Yoo, C. D.",
        TITLE = "Modality Shifting Attention Network for Multi-Modal Video Question Answering",
        BOOKTITLE = CVPR20,
        YEAR = "2020",
        PAGES = "10103--10112",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202145"}

@inproceedings{bb207036,
        AUTHOR = "Jiang, M. and Chen, S. and Yang, J. and Zhao, Q.",
        TITLE = "Fantastic Answers and Where to Find Them: Immersive Question-Directed Visual Attention",
        BOOKTITLE = CVPR20,
        YEAR = "2020",
        PAGES = "2977--2986",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202146"}

@inproceedings{bb207037,
        AUTHOR = "Yang, Z. and Garcia, N. and Chu, C. and Otani, M. and Nakashima, Y. and Takemura, H.",
        TITLE = "{BERT} Representations for Video Question Answering",
        BOOKTITLE = WACV20,
        YEAR = "2020",
        PAGES = "1545--1554",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202147"}

@inproceedings{bb207038,
        AUTHOR = "Fan, C. Y. and Zhang, X. F. and Zhang, S. and Wang, W. S. and Zhang, C. and Huang, H.",
        TITLE = "Heterogeneous Memory Enhanced Multimodal Attention Model for Video Question Answering",
        BOOKTITLE = CVPR19,
        YEAR = "2019",
        PAGES = "1999--2007",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202148"}

@inproceedings{bb207039,
        AUTHOR = "Kim, J. Y. and Ma, M. and Kim, K. and Kim, S. and Yoo, C. D.",
        TITLE = "Progressive Attention Memory Network for Movie Story Question Answering",
        BOOKTITLE = CVPR19,
        YEAR = "2019",
        PAGES = "8329--8338",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202149"}

@inproceedings{bb207040,
        AUTHOR = "Liu, C. N. and Chen, D. J. and Chen, H. T. and Liu, T. L.",
        TITLE = "{A2A}: Attention to Attention Reasoning for Movie Question Answering",
        BOOKTITLE = ACCV18,
        YEAR = "2018",
        PAGES = "VI:404--419",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202150"}

@inproceedings{bb207041,
        AUTHOR = "Gao, J. and Ge, R. and Chen, K. and Nevatia, R.",
        TITLE = "Motion-Appearance Co-memory Networks for Video Question Answering",
        BOOKTITLE = CVPR18,
        YEAR = "2018",
        PAGES = "6576--6585",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202151"}

@inproceedings{bb207042,
        AUTHOR = "Kim, K. M. and Choi, S. H. and Kim, J. H. and Zhang, B. T.",
        TITLE = "Multimodal Dual Attention Memory for Video Story Question Answering",
        BOOKTITLE = ECCV18,
        YEAR = "2018",
        PAGES = "XV:698--713",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202152"}

@inproceedings{bb207043,
        AUTHOR = "Yu, Y. J. and Kim, J. S. and Kim, G.",
        TITLE = "A Joint Sequence Fusion Model for Video Question Answering and Retrieval",
        BOOKTITLE = ECCV18,
        YEAR = "2018",
        PAGES = "VII:487--503",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202153"}

@inproceedings{bb207044,
        AUTHOR = "Hasan Chowdhury, M. I. and Nguyen, K. and Sridharan, S. and Fookes, C.",
        TITLE = "Hierarchical Relational Attention for Video Question Answering",
        BOOKTITLE = ICIP18,
        YEAR = "2018",
        PAGES = "599--603",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202154"}

@inproceedings{bb207045,
        AUTHOR = "Mun, J. and Seo, P. H. and Jung, I. and Han, B. H.",
        TITLE = "{MarioQA}: Answering Questions by Watching Gameplay Videos",
        BOOKTITLE = ICCV17,
        YEAR = "2017",
        PAGES = "2886--2894",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202155"}

@inproceedings{bb207046,
        AUTHOR = "Yu, Y. and Ko, H. and Choi, J. and Kim, G.",
        TITLE = "End-to-End Concept Word Detection for Video Captioning, Retrieval, and Question Answering",
        BOOKTITLE = CVPR17,
        YEAR = "2017",
        PAGES = "3261--3269",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202156"}

@article{bb207047,
        AUTHOR = "Kafle, K. and Kanan, C.",
        TITLE = "Visual question answering: Datasets, algorithms, and future challenges",
        JOURNAL = CVIU,
        VOLUME = "163",
        YEAR = "2017",
        NUMBER = "1",
        PAGES = "3--20",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202159"}

@article{bb207048,
        AUTHOR = "Wu, Q. and Teney, D. and Wang, P. and Shen, C. H. and Dick, A. and van den Hengel, A. J.",
        TITLE = "Visual question answering: A survey of methods and datasets",
        JOURNAL = CVIU,
        VOLUME = "163",
        YEAR = "2017",
        NUMBER = "1",
        PAGES = "21--40",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202160"}

@article{bb207049,
        AUTHOR = "Teney, D. and Wu, Q. and van den Hengel, A. J.",
        TITLE = "Visual Question Answering: A Tutorial",
        JOURNAL = SPMag,
        VOLUME = "34",
        YEAR = "2017",
        NUMBER = "6",
        MONTH = nov,
        PAGES = "63--75",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202161"}

@inproceedings{bb207050,
        AUTHOR = "Teney, D. and Liu, L. and van den Hengel, A. J.",
        TITLE = "Graph-Structured Representations for Visual Question Answering",
        BOOKTITLE = CVPR17,
        YEAR = "2017",
        PAGES = "3233--3241",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202162"}

@inproceedings{bb207051,
        AUTHOR = "Teney, D. and van den Hengel, A. J.",
        TITLE = "Visual Question Answering as a Meta Learning Task",
        BOOKTITLE = ECCV18,
        YEAR = "2018",
        PAGES = "XV:229--245",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202163"}

@inproceedings{bb207052,
        AUTHOR = "Teney, D. and Abbasnejad, E. and van den Hengel, A. J.",
        TITLE = "Unshuffling Data for Improved Generalization in Visual Question Answering",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "1397--1407",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202164"}

@article{bb207053,
        AUTHOR = "Wu, Q. and Shen, C. H. and Wang, P. and Dick, A. and van den Hengel, A. J.",
        TITLE = "Image Captioning and Visual Question Answering Based on Attributes and External Knowledge",
        JOURNAL = PAMI,
        VOLUME = "40",
        YEAR = "2018",
        NUMBER = "6",
        MONTH = jun,
        PAGES = "1367--1381",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202165"}

@inproceedings{bb207054,
        AUTHOR = "Wu, Q. and Wang, P. and Shen, C. H. and Dick, A. and van den Hengel, A. J.",
        TITLE = "Ask Me Anything: Free-Form Visual Question Answering Based on Knowledge from External Sources",
        BOOKTITLE = CVPR16,
        YEAR = "2016",
        PAGES = "4622--4630",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202166"}

@article{bb207055,
        AUTHOR = "Tommasi, T. and Mallya, A. and Plummer, B. A. and Lazebnik, S. and Berg, A. C. and Berg, T. L.",
        TITLE = "Combining Multiple Cues for Visual {Madlibs} Question Answering",
        JOURNAL = IJCV,
        VOLUME = "127",
        YEAR = "2019",
        NUMBER = "1",
        MONTH = jan,
        PAGES = "38--60",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202167"}

@inproceedings{bb207056,
        AUTHOR = "Tommasi, T. and Mallya, A. and Plummer, B. A. and Lazebnik, S. and Berg, A. C. and Berg, T. L.",
        TITLE = "Solving Visual {Madlibs} with Multiple Cues",
        BOOKTITLE = BMVC16,
        YEAR = "2016",
        REVIEWNOTE = "PAGES omitted: the source carried only the placeholder xx-yy; fill in if a real page range is found",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202168"}

@inproceedings{bb207057,
        AUTHOR = "Yu, L. C. and Park, E. and Berg, A. C. and Berg, T. L.",
        TITLE = "Visual {Madlibs}: Fill in the Blank Description Generation and Question Answering",
        BOOKTITLE = ICCV15,
        YEAR = "2015",
        PAGES = "2461--2469",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202169"}

@article{bb207058,
        AUTHOR = "Liu, F. and Xiang, T. and Hospedales, T. M. and Yang, W. K. and Sun, C. Y.",
        TITLE = "Inverse Visual Question Answering: A New Benchmark and {VQA} Diagnosis Tool",
        JOURNAL = PAMI,
        VOLUME = "42",
        YEAR = "2020",
        NUMBER = "2",
        MONTH = feb,
        PAGES = "460--474",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202170"}

@inproceedings{bb207059,
        AUTHOR = "Liu, F. and Xiang, T. and Hospedales, T. M. and Yang, W. K. and Sun, C. Y.",
        TITLE = "{iVQA}: Inverse Visual Question Answering",
        BOOKTITLE = CVPR18,
        YEAR = "2018",
        PAGES = "8611--8619",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202171"}

@article{bb207060,
        AUTHOR = "Patil, C. and Patwardhan, M.",
        TITLE = "Visual Question Generation: The State of the Art",
        JOURNAL = Surveys,
        VOLUME = "53",
        YEAR = "2020",
        NUMBER = "3",
        MONTH = may,
        REVIEWNOTE = "PAGES omitted: the source carried only the placeholder xx-yy; fill in the page range or article number if found",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202172"}

@article{bb207061,
        AUTHOR = "He, F. J. and Wang, Y. X. and Miao, X. L. and Sun, X.",
        TITLE = "Interpretable visual reasoning: A survey",
        JOURNAL = IVC,
        VOLUME = "112",
        YEAR = "2021",
        PAGES = "104194",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202173"}

@article{bb207062,
        AUTHOR = "Sharma, H. and Jalal, A. S.",
        TITLE = "A survey of methods, datasets and evaluation metrics for visual question answering",
        JOURNAL = IVC,
        VOLUME = "116",
        YEAR = "2021",
        PAGES = "104327",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202174"}

@article{bb207063,
        AUTHOR = "Yang, L. and Jiang, H. and Song, Q. and Guo, J.",
        TITLE = "A Survey on Long-Tailed Visual Recognition",
        JOURNAL = IJCV,
        VOLUME = "130",
        YEAR = "2022",
        NUMBER = "7",
        MONTH = jul,
        PAGES = "1837--1872",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202175"}

@article{bb207064,
        AUTHOR = "Zhao, W. L. and Rao, Y. M. and Tang, Y. S. and Zhou, J. and Lu, J. W.",
        TITLE = "{VideoABC}: A Real-World Video Dataset for Abductive Visual Reasoning",
        JOURNAL = IP,
        VOLUME = "31",
        YEAR = "2022",
        PAGES = "6048--6061",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202176"}

@article{bb207065,
        AUTHOR = "Lahouti, F. and Kostina, V. and Hassibi, B.",
        TITLE = "How to Query an Oracle? Efficient Strategies to Label Data",
        JOURNAL = PAMI,
        VOLUME = "44",
        YEAR = "2022",
        NUMBER = "11",
        MONTH = nov,
        PAGES = "7597--7609",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202177"}

@inproceedings{bb207066,
        AUTHOR = "Zhu, L. and Ning, R. and Li, J. and Xin, C. S. and Wu, H. Y.",
        TITLE = "Most and Least Retrievable Images in Visual-Language Query Systems",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVII:1--18",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202178"}

@inproceedings{bb207067,
        AUTHOR = "Salewski, L. and Emde, C. and Do, V. and Akata, Z. and Lukasiewicz, T.",
        TITLE = "{e-ViL}: A Dataset and Benchmark for Natural Language Explanations in Vision-Language Tasks",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "1224--1234",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202179"}

@inproceedings{bb207068,
        AUTHOR = "Gupta, V. and Patro, B. N. and Parihar, H. and Namboodiri, V. P.",
        TITLE = "{VQuAD}: Video Question Answering Diagnostic Dataset",
        BOOKTITLE = Novelty22,
        YEAR = "2022",
        PAGES = "282--291",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202180"}

@inproceedings{bb207069,
        AUTHOR = "Nishimura, T. and Sakoda, K. and Hashimoto, A. and Ushiku, Y. and Tanaka, N. and Ono, F. and Kameko, H. and Mori, S.",
        TITLE = "Egocentric Biochemical Video-and-Language Dataset",
        BOOKTITLE = CLVL21,
        YEAR = "2021",
        PAGES = "3122--3126",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202181"}

@inproceedings{bb207070,
        AUTHOR = "Zhang, M. and Maidment, T. and Diab, A. and Kovashka, A. and Hwa, R.",
        TITLE = "Domain-robust {VQA} with diverse datasets and methods but no target labels",
        BOOKTITLE = CVPR21,
        YEAR = "2021",
        PAGES = "7042--7052",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202182"}

@inproceedings{bb207071,
        AUTHOR = "Mathew, M. and Karatzas, D. and Jawahar, C. V.",
        TITLE = "{DocVQA}: A Dataset for {VQA} on Document Images",
        BOOKTITLE = WACV21,
        YEAR = "2021",
        PAGES = "2199--2208",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202183"}

@inproceedings{bb207072,
        AUTHOR = "Patel, D. and Parikh, R. and Shastri, Y.",
        TITLE = "Recent Advances in Video Question Answering: A Review of Datasets and Methods",
        BOOKTITLE = VTIUR20,
        YEAR = "2020",
        PAGES = "339--356",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202184"}

@inproceedings{bb207073,
        AUTHOR = "Fan, C.",
        TITLE = "{EgoVQA}: An Egocentric Video Question Answering Benchmark Dataset",
        BOOKTITLE = EPIC19,
        YEAR = "2019",
        PAGES = "4359--4366",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202185"}

@inproceedings{bb207074,
        AUTHOR = "Hudson, D. A. and Manning, C. D.",
        TITLE = "{GQA}: A New Dataset for Real-World Visual Reasoning and Compositional Question Answering",
        BOOKTITLE = CVPR19,
        YEAR = "2019",
        PAGES = "6693--6702",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202186"}

@inproceedings{bb207075,
        AUTHOR = "Yang, G. Y. R. and Ganichev, I. and Wang, X. J. and Shlens, J. and Sussillo, D.",
        TITLE = "A Dataset and Architecture for Visual Reasoning with a Working Memory",
        BOOKTITLE = ECCV18,
        YEAR = "2018",
        PAGES = "X:729--745",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202187"}

@inproceedings{bb207076,
        AUTHOR = "Gan, C. and Li, Y. and Li, H. and Sun, C. and Gong, B.",
        TITLE = "{VQS}: Linking Segmentations to Questions and Answers for Supervised Attention in {VQA} and Question-Focused Semantic Segmentation",
        BOOKTITLE = ICCV17,
        YEAR = "2017",
        PAGES = "1829--1838",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202188"}

@inproceedings{bb207077,
        AUTHOR = "Maharaj, T. and Ballas, N. and Rohrbach, A. and Courville, A. and Pal, C.",
        TITLE = "A Dataset and Exploration of Models for Understanding Video Data through Fill-in-the-Blank Question-Answering",
        BOOKTITLE = CVPR17,
        YEAR = "2017",
        PAGES = "7359--7368",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202189"}

@article{bb207078,
        AUTHOR = "Das, A. and Kottur, S. and Gupta, K. and Singh, A. and Yadav, D. and Lee, S. and Moura, J. M. F. and Parikh, D. and Batra, D.",
        TITLE = "Visual Dialog",
        JOURNAL = PAMI,
        VOLUME = "41",
        YEAR = "2019",
        NUMBER = "5",
        MONTH = may,
        PAGES = "1242--1256",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202190"}

@article{bb207079,
        AUTHOR = "Zhao, Z. and Zhang, Z. and Jiang, X. H. and Cai, D.",
        TITLE = "Multi-Turn Video Question Answering via Hierarchical Attention Context Reinforced Networks",
        JOURNAL = IP,
        VOLUME = "28",
        YEAR = "2019",
        NUMBER = "8",
        MONTH = aug,
        PAGES = "3860--3872",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202191"}

@article{bb207080,
        AUTHOR = "Gu, M. and Zhao, Z. and Jin, W. and Cai, D. and Wu, F.",
        TITLE = "Video Dialog via Multi-Grained Convolutional Self-Attention Context Multi-Modal Networks",
        JOURNAL = CirSysVideo,
        VOLUME = "30",
        YEAR = "2020",
        NUMBER = "12",
        MONTH = dec,
        PAGES = "4453--4466",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202192"}

@article{bb207081,
        AUTHOR = "Guo, D. and Wang, H. and Wang, S. and Wang, M.",
        TITLE = "Textual-Visual Reference-Aware Attention Network for Visual Dialog",
        JOURNAL = IP,
        VOLUME = "29",
        YEAR = "2020",
        PAGES = "6655--6666",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202193"}

@article{bb207082,
        AUTHOR = "Patro, B. N. and Anupriy and Namboodiri, V. P.",
        TITLE = "Probabilistic framework for solving visual dialog",
        JOURNAL = PR,
        VOLUME = "110",
        YEAR = "2021",
        PAGES = "107586",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202194"}

@article{bb207083,
        AUTHOR = "Zhao, L. and Lyu, X. Y. and Song, J. K. and Gao, L. L.",
        TITLE = "{GuessWhich}? Visual dialog with attentive memory network",
        JOURNAL = PR,
        VOLUME = "114",
        YEAR = "2021",
        PAGES = "107823",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202195"}

@article{bb207084,
  author    = {Jiang, T.L. and Shao, H.L. and Tian, X. and Ji, Y. and Liu, C.P.},
  title     = {Aligning vision-language for graph inference in visual dialog},
  journal   = IVC,
  volume    = {116},
  year      = {2021},
  pages     = {104316},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202196}
}

@article{bb207085,
  author    = {Guo, D. and Wang, H. and Wang, M.},
  title     = {Context-Aware Graph Inference With Knowledge Distillation for Visual
Dialog},
  journal   = PAMI,
  volume    = {44},
  year      = {2022},
  number    = {10},
  month     = {October},
  pages     = {6056-6073},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202197}
}

@inproceedings{bb207086,
  author    = {Guo, D. and Wang, H. and Zhang, H.W. and Zha, Z.J. and Wang, M.},
  title     = {Iterative Context-Aware Graph Inference for Visual Dialog},
  booktitle = CVPR20,
  year      = {2020},
  pages     = {10052-10061},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202198}
}

@article{bb207087,
  author    = {Patro, B.N. and Anupriy and Namboodiri, V.P.},
  title     = {Explanation vs. attention: A two-player game to obtain attention for
VQA and visual dialog},
  journal   = PR,
  volume    = {132},
  year      = {2022},
  pages     = {108898},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202199}
}

@article{bb207088,
  author    = {Zhu, Y. and Wu, Y. and Yang, Y. and Yan, Y.},
  title     = {Saying the Unseen: Video Descriptions via Dialog Agents},
  journal   = PAMI,
  volume    = {44},
  year      = {2022},
  number    = {10},
  month     = {October},
  pages     = {7190-7204},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202200}
}

@article{bb207089,
  author    = {Huang, Y. and Wang, Y.M. and Wang, L.},
  title     = {Efficient Image and Sentence Matching},
  journal   = PAMI,
  volume    = {45},
  year      = {2023},
  number    = {3},
  month     = {March},
  pages     = {2970-2983},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202201}
}

@article{bb207090,
  author    = {Zhao, L. and Li, J.L. and Gao, L.L. and Rao, Y. and Song, J.K. and Shen, H.T.},
  title     = {Heterogeneous Knowledge Network for Visual Dialog},
  journal   = CirSysVideo,
  volume    = {33},
  year      = {2023},
  number    = {2},
  month     = {February},
  pages     = {861-871},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202202}
}

@article{bb207091,
  author    = {Bucinca, Z. and Yemez, Y. and Erzin, E. and Sezgin, M.},
  title     = {AffectON: Incorporating Affect Into Dialog Generation},
  journal   = AffCom,
  volume    = {14},
  year      = {2023},
  number    = {1},
  month     = {January},
  pages     = {823-835},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202203}
}

@article{bb207092,
        AUTHOR = "Yu, H. and Ko, Y.J.",
        TITLE = "Enriching the dialogue state tracking model with a syntactic
discourse graph",
        JOURNAL = PRL,
        VOLUME = "169",
        YEAR = "2023",
        PAGES = "81-86",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202204"}

@article{bb207093,
  author    = {Wu, Y.X. and Liao, L. and Zhang, G.Y. and Lei, W.Q. and Zhao, G.S. and Qian, X.M. and Chua, T.S.},
  title     = {State Graph Reasoning for Multimodal Conversational Recommendation},
  journal   = MultMed,
  volume    = {25},
  year      = {2023},
  pages     = {3113-3124},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202205}
}

@article{bb207094,
  author    = {Firdaus, M. and Thangavelu, N. and Ekbal, A. and Bhattacharyya, P.},
  title     = {I Enjoy Writing and Playing, Do You?: A Personalized and Emotion
Grounded Dialogue Agent Using Generative Adversarial Network},
  journal   = AffCom,
  volume    = {14},
  year      = {2023},
  number    = {3},
  month     = {July},
  pages     = {2127-2138},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202206}
}

@inproceedings{bb207095,
  author    = {Madasu, A. and Lal, V.},
  title     = {Is Multimodal Vision Supervision Beneficial to Language?},
  booktitle = NFVLR23,
  year      = {2023},
  pages     = {2637-2642},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202207}
}

@inproceedings{bb207096,
  author    = {Ashutosh, K. and Girdhar, R. and Torresani, L. and Grauman, K.},
  title     = {HierVL: Learning Hierarchical Video-Language Embeddings},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {23066-23078},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202208}
}

@inproceedings{bb207097,
        AUTHOR = "Smith, J.S. and Cascante Bonilla, P. and Arbelle, A. and Kim, D.H. and Panda, R. and Cox, D. and Yang, D. and Kira, Z. and Feris, R. and Karlinsky, L.",
        TITLE = "ConStruct-VL: Data-Free Continual Structured VL Concepts Learning",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "14994-15004",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202209"}

@inproceedings{bb207098,
  author    = {Chen, Y.X. and Ma, Z.Y. and Zhang, Z.Q. and Qi, Z.G. and Yuan, C.F. and Shan, Y. and Li, B. and Hu, W.M. and Qie, X. and Wu, J.P.},
  title     = {ViLEM: Visual-Language Error Modeling for Image-Text Retrieval},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {11018-11027},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202210}
}

@inproceedings{bb207099,
  author    = {Huang, J.J. and Li, Y. and Feng, J.S. and Wu, X.L. and Sun, X.S. and Ji, R.R.},
  title     = {Clover: Towards A Unified Video-Language Alignment and Fusion Model},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {14856-14866},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202211}
}

Last update: Nov 30, 2023 at 15:51:27