% Bibliography entries bb207000 through bb207099.
%
% Conventions used in this block:
%  * One entry per paragraph, one field per line.
%  * JOURNAL and BOOKTITLE values (PAMI, IP, CVPR23, ECCV22, ...) are unquoted
%    string macros. No definition for them appears in this block; presumably
%    they are defined elsewhere in the bibliography -- confirm before using
%    this block on its own.
%  * MONTH uses the standard three-letter month macros (jan ... dec), unquoted.
%  * PAGES ranges use a double hyphen. Where the PAGES value starts with a
%    roman numeral and a colon (for example XXXVI:39--58), the numeral is the
%    volume prefix carried over from the source data, written with no space
%    after the colon. A PAGES value that is a single number (for example
%    103821) is kept exactly as it appeared in the source data.
%  * BIBSOURCE and REVIEWNOTE are not standard fields and are ignored by
%    standard styles. REVIEWNOTE marks entries whose page range was only a
%    placeholder in the source data.

% ---- Entries whose BIBSOURCE page is applicat803vidq2.html ----

@article{bb207000,
  AUTHOR = "Luo, H.N. and Lin, G.S. and Yao, Y.Z. and Liu, F.Y. and Liu, Z.C. and Tang, Z.M.",
  TITLE = "Depth and Video Segmentation Based Visual Attention for Embodied Question Answering",
  JOURNAL = PAMI,
  VOLUME = "45",
  YEAR = "2023",
  NUMBER = "6",
  MONTH = jun,
  PAGES = "6807--6819",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202110"
}

@inproceedings{bb207001,
  AUTHOR = "Luo, H.N. and Lin, G.S. and Liu, Z.C. and Liu, F.Y. and Tang, Z.M. and Yao, Y.Z.",
  TITLE = "SegEQA: Video Segmentation Based Visual Attention for Embodied Question Answering",
  BOOKTITLE = ICCV19,
  YEAR = "2019",
  PAGES = "9666--9675",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202111"
}

@article{bb207002,
  AUTHOR = "Zhang, X. and Zhang, F.F. and Xu, C.S.",
  TITLE = "Reducing Vision-Answer Biases for Multiple-Choice VQA",
  JOURNAL = IP,
  VOLUME = "32",
  YEAR = "2023",
  PAGES = "4621--4634",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202112"
}

@article{bb207003,
  AUTHOR = "Xiao, J.B. and Zhou, P. and Yao, A. and Li, Y.C. and Hong, R. and Yan, S.C. and Chua, T.S.",
  TITLE = "Contrastive Video Question Answering via Video Graph Transformer",
  JOURNAL = PAMI,
  VOLUME = "45",
  YEAR = "2023",
  NUMBER = "11",
  MONTH = nov,
  PAGES = "13265--13280",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202113"
}

@inproceedings{bb207004,
  AUTHOR = "Xiao, J.B. and Zhou, P. and Chua, T.S. and Yan, S.C.",
  TITLE = "Video Graph Transformer for Video Question Answering",
  BOOKTITLE = ECCV22,
  YEAR = "2022",
  PAGES = "XXXVI:39--58",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202114"
}

@article{bb207005,
  AUTHOR = "Shen, W.X. and Song, J. and Zhu, X. and Li, G. and Shen, H.T.",
  TITLE = "End-to-End Pre-Training With Hierarchical Matching and Momentum Contrast for Text-Video Retrieval",
  JOURNAL = IP,
  VOLUME = "32",
  YEAR = "2023",
  PAGES = "5017--5030",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202115"
}

@article{bb207006,
  AUTHOR = "Jiang, J.J. and Liu, Z. and Zheng, N.N.",
  TITLE = "LiVLR: A Lightweight Visual-Linguistic Reasoning Framework for Video Question Answering",
  JOURNAL = MultMed,
  VOLUME = "25",
  YEAR = "2023",
  PAGES = "5002--5013",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202116"
}

@article{bb207007,
  AUTHOR = "Xu, F.F. and Zhu, Y. and Wang, C. and Cao, Y.Z. and Zhong, Z. and Li, X.M.",
  TITLE = "Spatio-Temporal Two-stage Fusion for video question answering",
  JOURNAL = CVIU,
  VOLUME = "237",
  YEAR = "2023",
  PAGES = "103821",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202117"
}

@inproceedings{bb207008,
  AUTHOR = "Khan, Z. and Kumar, B.V. and Schulter, S. and Yu, X. and Fu, Y. and Chandraker, M.",
  TITLE = "Q: How to Specialize Large Vision-Language Models to Data-Scarce VQA Tasks? A: Self-Train on Unlabeled Images!",
  BOOKTITLE = CVPR23,
  YEAR = "2023",
  PAGES = "15005--15015",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202118"
}

@inproceedings{bb207009,
  AUTHOR = "Su, H.T. and Niu, Y. and Lin, X.D. and Hsu, W.H. and Chang, S.F.",
  TITLE = "Language Models are Causal Knowledge Extractors for Zero-shot Video Question Answering",
  BOOKTITLE = L3D-IVU23,
  YEAR = "2023",
  PAGES = "4951--4960",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202119"
}

@inproceedings{bb207010,
  AUTHOR = "Zang, C.Q. and Wang, H.Q. and Pei, M.T. and Liang, W.",
  TITLE = "Discovering the Real Association: Multimodal Causal Reasoning in Video Question Answering",
  BOOKTITLE = CVPR23,
  YEAR = "2023",
  PAGES = "19027--19036",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202120"
}

@inproceedings{bb207011,
  AUTHOR = "Gao, D.F. and Zhou, L. and Ji, L. and Zhu, L.C. and Yang, Y. and Shou, M.Z.",
  TITLE = "MIST: Multi-modal Iterative Spatial-Temporal Transformer for Long-form Video Question Answering",
  BOOKTITLE = CVPR23,
  YEAR = "2023",
  PAGES = "14773--14783",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202121"
}

@inproceedings{bb207012,
  AUTHOR = "Khan, A.U. and Kuehne, H. and Wu, B. and Chheu, K. and Bousselham, W. and Gan, C. and Lobo, N. and Shah, M.",
  TITLE = "Learning Situation Hyper-Graphs for Video Question Answering",
  BOOKTITLE = CVPR23,
  YEAR = "2023",
  PAGES = "14879--14889",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202122"
}

@inproceedings{bb207013,
  AUTHOR = "Jahagirdar, S. and Mathew, M. and Karatzas, D. and Jawahar, C.V.",
  TITLE = "Watching the News: Towards VideoQA Models that can Read",
  BOOKTITLE = WACV23,
  YEAR = "2023",
  PAGES = "4430--4439",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202123"
}

@inproceedings{bb207014,
  AUTHOR = "Zhang, M. and Hwa, R. and Kovashka, A.",
  TITLE = "How to Practice VQA on a Resource-limited Target Domain",
  BOOKTITLE = WACV23,
  YEAR = "2023",
  PAGES = "4440--4449",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202124"
}

@inproceedings{bb207015,
  AUTHOR = "Lee, J. and Kang, W. and Kim, E.S.",
  TITLE = "Dense but Efficient VideoQA for Intricate Compositional Reasoning",
  BOOKTITLE = WACV23,
  YEAR = "2023",
  PAGES = "1114--1123",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202125"
}

@inproceedings{bb207016,
  AUTHOR = "Shen, R. and Inoue, N. and Shinoda, K.",
  TITLE = "Text-Guided Object Detector for Multi-modal Video Question Answering",
  BOOKTITLE = WACV23,
  YEAR = "2023",
  PAGES = "1032--1042",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202126"
}

@inproceedings{bb207017,
  AUTHOR = "Fang, S. and Wang, S.H. and Zhuo, J. and Han, X.Z. and Huang, Q.M.",
  TITLE = "Learning Linguistic Association Towards Efficient Text-Video Retrieval",
  BOOKTITLE = ECCV22,
  YEAR = "2022",
  PAGES = "XXXVI:254--270",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202127"
}

@inproceedings{bb207018,
  AUTHOR = "Piergiovanni, A.J. and Morton, K. and Kuo, W.C. and Ryoo, M.S. and Angelova, A.",
  TITLE = "Video Question Answering with Iterative Video-Text Co-tokenization",
  BOOKTITLE = ECCV22,
  YEAR = "2022",
  PAGES = "XXXVI:76--94",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202128"
}

@inproceedings{bb207019,
  AUTHOR = "Barmann, L. and Waibel, A.",
  TITLE = "Where did I leave my keys?: Episodic-Memory-Based Question Answering on Egocentric Videos",
  BOOKTITLE = Ego4D-EPIC22,
  YEAR = "2022",
  PAGES = "1559--1567",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202129"
}

@inproceedings{bb207020,
  AUTHOR = "Li, J.T. and Niu, L. and Zhang, L.Q.",
  TITLE = "From Representation to Reasoning: Towards both Evidence and Commonsense Reasoning for Video Question-Answering",
  BOOKTITLE = CVPR22,
  YEAR = "2022",
  PAGES = "21241--21250",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202130"
}

@inproceedings{bb207021,
  AUTHOR = "Datta, S. and Dharur, S. and Cartillier, V. and Desai, R. and Khanna, M. and Batra, D. and Parikh, D.",
  TITLE = "Episodic Memory Question Answering",
  BOOKTITLE = CVPR22,
  YEAR = "2022",
  PAGES = "19097--19106",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202131"
}

@inproceedings{bb207022,
  AUTHOR = "Gandhi, M. and Gul, M.O. and Prakash, E. and Grunde McLaughlin, M. and Krishna, R. and Agrawala, M.",
  TITLE = "Measuring Compositional Consistency for Video Question Answering",
  BOOKTITLE = CVPR22,
  YEAR = "2022",
  PAGES = "5036--5045",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202132"
}

@inproceedings{bb207023,
  AUTHOR = "Gorti, S.K. and Vouitsis, N. and Ma, J.W. and Golestan, K. and Volkovs, M. and Garg, A. and Yu, G.",
  TITLE = "X-Pool: Cross-Modal Language-Video Attention for Text-Video Retrieval",
  BOOKTITLE = CVPR22,
  YEAR = "2022",
  PAGES = "4996--5005",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202133"
}

@inproceedings{bb207024,
  AUTHOR = "Li, J.C. and Tang, S.L. and Zhu, L.C. and Shi, H. and Huang, X. and Wu, F. and Yang, Y. and Zhuang, Y.T.",
  TITLE = "Adaptive Hierarchical Graph Reasoning with Semantic Coherence for Video-and-Language Inference",
  BOOKTITLE = ICCV21,
  YEAR = "2021",
  PAGES = "1847--1857",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202134"
}

@inproceedings{bb207025,
  AUTHOR = "Zhang, M.X. and Yang, Y. and Chen, X. and Ji, Y.L. and Xu, X. and Li, J.J. and Shen, H.T.",
  TITLE = "Multi-stage Aggregated Transformer Network for Temporal Language Localization in Videos",
  BOOKTITLE = CVPR21,
  YEAR = "2021",
  PAGES = "12664--12673",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202135"
}

@inproceedings{bb207026,
  AUTHOR = "Kim, N. and Ha, S.J. and Kang, J.W.",
  TITLE = "Video Question Answering Using Language-Guided Deep Compressed-Domain Video Feature",
  BOOKTITLE = ICCV21,
  YEAR = "2021",
  PAGES = "1688--1697",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202136"
}

@inproceedings{bb207027,
  AUTHOR = "Liu, F. and Liu, J. and Wang, W.N. and Lu, H.Q.",
  TITLE = "HAIR: Hierarchical Visual-Semantic Relational Reasoning for Video Question Answering",
  BOOKTITLE = ICCV21,
  YEAR = "2021",
  PAGES = "1678--1687",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202137"
}

@inproceedings{bb207028,
  AUTHOR = "Yang, A. and Miech, A. and Sivic, J. and Laptev, I. and Schmid, C.",
  TITLE = "Just Ask: Learning to Answer Questions from Millions of Narrated Videos",
  BOOKTITLE = ICCV21,
  YEAR = "2021",
  PAGES = "1666--1677",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202138"
}

@inproceedings{bb207029,
  AUTHOR = "Gao, D.F. and Wang, R.P. and Bai, Z. and Chen, X.L.",
  TITLE = "Env-QA: A Video Question Answering Benchmark for Comprehensive Understanding of Dynamic Environments",
  BOOKTITLE = ICCV21,
  YEAR = "2021",
  PAGES = "1655--1665",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202139"
}

@inproceedings{bb207030,
  AUTHOR = "Yun, H. and Yu, Y. and Yang, W. and Lee, K. and Kim, G.",
  TITLE = "Pano-AVQA: Grounded Audio-Visual Question Answering on 360{$^\circ$} Videos",
  BOOKTITLE = ICCV21,
  YEAR = "2021",
  PAGES = "2011--2021",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202140"
}

@inproceedings{bb207031,
  AUTHOR = "Xu, L. and Huang, H. and Liu, J.",
  TITLE = "SUTD-TrafficQA: A Question Answering Benchmark and an Efficient Network for Video Reasoning over Traffic Events",
  BOOKTITLE = CVPR21,
  YEAR = "2021",
  PAGES = "9873--9883",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202141"
}

@inproceedings{bb207032,
  AUTHOR = "Park, J. and Lee, J.Y. and Sohn, K.H.",
  TITLE = "Bridge to Answer: Structure-aware Graph Interaction Network for Video Question Answering",
  BOOKTITLE = CVPR21,
  YEAR = "2021",
  PAGES = "15521--15530",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202142"
}

@inproceedings{bb207033,
  AUTHOR = "Chen, X.W. and Liu, R. and Song, X.M. and Han, Y.H.",
  TITLE = "Locating Visual Explanations for Video Question Answering",
  BOOKTITLE = MMMod21,
  YEAR = "2021",
  PAGES = "I:290--302",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202143"
}

@inproceedings{bb207034,
  AUTHOR = "Garcia, N. and Nakashima, Y.",
  TITLE = "Knowledge-based Video Question Answering with Unsupervised Scene Descriptions",
  BOOKTITLE = ECCV20,
  YEAR = "2020",
  PAGES = "XVIII:581--598",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202144"
}

@inproceedings{bb207035,
  AUTHOR = "Kim, J. and Ma, M. and Pham, T. and Kim, K. and Yoo, C.D.",
  TITLE = "Modality Shifting Attention Network for Multi-Modal Video Question Answering",
  BOOKTITLE = CVPR20,
  YEAR = "2020",
  PAGES = "10103--10112",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202145"
}

@inproceedings{bb207036,
  AUTHOR = "Jiang, M. and Chen, S. and Yang, J. and Zhao, Q.",
  TITLE = "Fantastic Answers and Where to Find Them: Immersive Question-Directed Visual Attention",
  BOOKTITLE = CVPR20,
  YEAR = "2020",
  PAGES = "2977--2986",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202146"
}

@inproceedings{bb207037,
  AUTHOR = "Yang, Z. and Garcia, N. and Chu, C. and Otani, M. and Nakashima, Y. and Takemura, H.",
  TITLE = "BERT Representations for Video Question Answering",
  BOOKTITLE = WACV20,
  YEAR = "2020",
  PAGES = "1545--1554",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202147"
}

@inproceedings{bb207038,
  AUTHOR = "Fan, C.Y. and Zhang, X.F. and Zhang, S. and Wang, W.S. and Zhang, C. and Huang, H.",
  TITLE = "Heterogeneous Memory Enhanced Multimodal Attention Model for Video Question Answering",
  BOOKTITLE = CVPR19,
  YEAR = "2019",
  PAGES = "1999--2007",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202148"
}

@inproceedings{bb207039,
  AUTHOR = "Kim, J.Y. and Ma, M. and Kim, K. and Kim, S. and Yoo, C.D.",
  TITLE = "Progressive Attention Memory Network for Movie Story Question Answering",
  BOOKTITLE = CVPR19,
  YEAR = "2019",
  PAGES = "8329--8338",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202149"
}

@inproceedings{bb207040,
  AUTHOR = "Liu, C.N. and Chen, D.J. and Chen, H.T. and Liu, T.L.",
  TITLE = "A2A: Attention to Attention Reasoning for Movie Question Answering",
  BOOKTITLE = ACCV18,
  YEAR = "2018",
  PAGES = "VI:404--419",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202150"
}

@inproceedings{bb207041,
  AUTHOR = "Gao, J. and Ge, R. and Chen, K. and Nevatia, R.",
  TITLE = "Motion-Appearance Co-memory Networks for Video Question Answering",
  BOOKTITLE = CVPR18,
  YEAR = "2018",
  PAGES = "6576--6585",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202151"
}

@inproceedings{bb207042,
  AUTHOR = "Kim, K.M. and Choi, S.H. and Kim, J.H. and Zhang, B.T.",
  TITLE = "Multimodal Dual Attention Memory for Video Story Question Answering",
  BOOKTITLE = ECCV18,
  YEAR = "2018",
  PAGES = "XV:698--713",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202152"
}

@inproceedings{bb207043,
  AUTHOR = "Yu, Y.J. and Kim, J.S. and Kim, G.",
  TITLE = "A Joint Sequence Fusion Model for Video Question Answering and Retrieval",
  BOOKTITLE = ECCV18,
  YEAR = "2018",
  PAGES = "VII:487--503",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202153"
}

@inproceedings{bb207044,
  AUTHOR = "Hasan Chowdhury, M.I. and Nguyen, K. and Sridharan, S. and Fookes, C.",
  TITLE = "Hierarchical Relational Attention for Video Question Answering",
  BOOKTITLE = ICIP18,
  YEAR = "2018",
  PAGES = "599--603",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202154"
}

@inproceedings{bb207045,
  AUTHOR = "Mun, J. and Seo, P.H. and Jung, I. and Han, B.H.",
  TITLE = "MarioQA: Answering Questions by Watching Gameplay Videos",
  BOOKTITLE = ICCV17,
  YEAR = "2017",
  PAGES = "2886--2894",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202155"
}

@inproceedings{bb207046,
  AUTHOR = "Yu, Y. and Ko, H. and Choi, J. and Kim, G.",
  TITLE = "End-to-End Concept Word Detection for Video Captioning, Retrieval, and Question Answering",
  BOOKTITLE = CVPR17,
  YEAR = "2017",
  PAGES = "3261--3269",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT202156"
}

% ---- Entries whose BIBSOURCE page is applicat803vqds43.html ----

@article{bb207047,
  AUTHOR = "Kafle, K. and Kanan, C.",
  TITLE = "Visual question answering: Datasets, algorithms, and future challenges",
  JOURNAL = CVIU,
  VOLUME = "163",
  YEAR = "2017",
  NUMBER = "1",
  PAGES = "3--20",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202159"
}

@article{bb207048,
  AUTHOR = "Wu, Q. and Teney, D. and Wang, P. and Shen, C.H. and Dick, A. and van den Hengel, A.J.",
  TITLE = "Visual question answering: A survey of methods and datasets",
  JOURNAL = CVIU,
  VOLUME = "163",
  YEAR = "2017",
  NUMBER = "1",
  PAGES = "21--40",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202160"
}

@article{bb207049,
  AUTHOR = "Teney, D. and Wu, Q. and van den Hengel, A.J.",
  TITLE = "Visual Question Answering: A Tutorial",
  JOURNAL = SPMag,
  VOLUME = "34",
  YEAR = "2017",
  NUMBER = "6",
  MONTH = nov,
  PAGES = "63--75",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202161"
}

@inproceedings{bb207050,
  AUTHOR = "Teney, D. and Liu, L. and van den Hengel, A.J.",
  TITLE = "Graph-Structured Representations for Visual Question Answering",
  BOOKTITLE = CVPR17,
  YEAR = "2017",
  PAGES = "3233--3241",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202162"
}

@inproceedings{bb207051,
  AUTHOR = "Teney, D. and van den Hengel, A.J.",
  TITLE = "Visual Question Answering as a Meta Learning Task",
  BOOKTITLE = ECCV18,
  YEAR = "2018",
  PAGES = "XV:229--245",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202163"
}

@inproceedings{bb207052,
  AUTHOR = "Teney, D. and Abbasnejad, E. and van den Hengel, A.J.",
  TITLE = "Unshuffling Data for Improved Generalization in Visual Question Answering",
  BOOKTITLE = ICCV21,
  YEAR = "2021",
  PAGES = "1397--1407",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202164"
}

@article{bb207053,
  AUTHOR = "Wu, Q. and Shen, C.H. and Wang, P. and Dick, A. and van den Hengel, A.J.",
  TITLE = "Image Captioning and Visual Question Answering Based on Attributes and External Knowledge",
  JOURNAL = PAMI,
  VOLUME = "40",
  YEAR = "2018",
  NUMBER = "6",
  MONTH = jun,
  PAGES = "1367--1381",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202165"
}

@inproceedings{bb207054,
  AUTHOR = "Wu, Q. and Wang, P. and Shen, C.H. and Dick, A. and van den Hengel, A.J.",
  TITLE = "Ask Me Anything: Free-Form Visual Question Answering Based on Knowledge from External Sources",
  BOOKTITLE = CVPR16,
  YEAR = "2016",
  PAGES = "4622--4630",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202166"
}

@article{bb207055,
  AUTHOR = "Tommasi, T. and Mallya, A. and Plummer, B.A. and Lazebnik, S. and Berg, A.C. and Berg, T.L.",
  TITLE = "Combining Multiple Cues for Visual Madlibs Question Answering",
  JOURNAL = IJCV,
  VOLUME = "127",
  YEAR = "2019",
  NUMBER = "1",
  MONTH = jan,
  PAGES = "38--60",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202167"
}

@inproceedings{bb207056,
  AUTHOR = "Tommasi, T. and Mallya, A. and Plummer, B.A. and Lazebnik, S. and Berg, A.C. and Berg, T.L.",
  TITLE = "Solving Visual Madlibs with Multiple Cues",
  BOOKTITLE = BMVC16,
  YEAR = "2016",
  REVIEWNOTE = "Page range unknown: the source data carried only the placeholder xx-yy",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202168"
}

@inproceedings{bb207057,
  AUTHOR = "Yu, L.C. and Park, E. and Berg, A.C. and Berg, T.L.",
  TITLE = "Visual Madlibs: Fill in the Blank Description Generation and Question Answering",
  BOOKTITLE = ICCV15,
  YEAR = "2015",
  PAGES = "2461--2469",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202169"
}

@article{bb207058,
  AUTHOR = "Liu, F. and Xiang, T. and Hospedales, T.M. and Yang, W.K. and Sun, C.Y.",
  TITLE = "Inverse Visual Question Answering: A New Benchmark and VQA Diagnosis Tool",
  JOURNAL = PAMI,
  VOLUME = "42",
  YEAR = "2020",
  NUMBER = "2",
  MONTH = feb,
  PAGES = "460--474",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202170"
}

@inproceedings{bb207059,
  AUTHOR = "Liu, F. and Xiang, T. and Hospedales, T.M. and Yang, W.K. and Sun, C.Y.",
  TITLE = "iVQA: Inverse Visual Question Answering",
  BOOKTITLE = CVPR18,
  YEAR = "2018",
  PAGES = "8611--8619",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202171"
}

@article{bb207060,
  AUTHOR = "Patil, C. and Patwardhan, M.",
  TITLE = "Visual Question Generation: The State of the Art",
  JOURNAL = Surveys,
  VOLUME = "53",
  YEAR = "2020",
  NUMBER = "3",
  MONTH = may,
  REVIEWNOTE = "Page range unknown: the source data carried only the placeholder xx-yy",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202172"
}

@article{bb207061,
  AUTHOR = "He, F.J. and Wang, Y.X. and Miao, X.L. and Sun, X.",
  TITLE = "Interpretable visual reasoning: A survey",
  JOURNAL = IVC,
  VOLUME = "112",
  YEAR = "2021",
  PAGES = "104194",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202173"
}

@article{bb207062,
  AUTHOR = "Sharma, H. and Jalal, A.S.",
  TITLE = "A survey of methods, datasets and evaluation metrics for visual question answering",
  JOURNAL = IVC,
  VOLUME = "116",
  YEAR = "2021",
  PAGES = "104327",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202174"
}

@article{bb207063,
  AUTHOR = "Yang, L. and Jiang, H. and Song, Q. and Guo, J.",
  TITLE = "A Survey on Long-Tailed Visual Recognition",
  JOURNAL = IJCV,
  VOLUME = "130",
  YEAR = "2022",
  NUMBER = "7",
  MONTH = jul,
  PAGES = "1837--1872",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202175"
}

@article{bb207064,
  AUTHOR = "Zhao, W.L. and Rao, Y.M. and Tang, Y.S. and Zhou, J. and Lu, J.W.",
  TITLE = "VideoABC: A Real-World Video Dataset for Abductive Visual Reasoning",
  JOURNAL = IP,
  VOLUME = "31",
  YEAR = "2022",
  PAGES = "6048--6061",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202176"
}

@article{bb207065,
  AUTHOR = "Lahouti, F. and Kostina, V. and Hassibi, B.",
  TITLE = "How to Query an Oracle? Efficient Strategies to Label Data",
  JOURNAL = PAMI,
  VOLUME = "44",
  YEAR = "2022",
  NUMBER = "11",
  MONTH = nov,
  PAGES = "7597--7609",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202177"
}

@inproceedings{bb207066,
  AUTHOR = "Zhu, L. and Ning, R. and Li, J. and Xin, C.S. and Wu, H.Y.",
  TITLE = "Most and Least Retrievable Images in Visual-Language Query Systems",
  BOOKTITLE = ECCV22,
  YEAR = "2022",
  PAGES = "XXXVII:1--18",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202178"
}

@inproceedings{bb207067,
  AUTHOR = "Salewski, L. and Emde, C. and Do, V. and Akata, Z. and Lukasiewicz, T.",
  TITLE = "e-ViL: A Dataset and Benchmark for Natural Language Explanations in Vision-Language Tasks",
  BOOKTITLE = ICCV21,
  YEAR = "2021",
  PAGES = "1224--1234",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202179"
}

@inproceedings{bb207068,
  AUTHOR = "Gupta, V. and Patro, B.N. and Parihar, H. and Namboodiri, V.P.",
  TITLE = "VQuAD: Video Question Answering Diagnostic Dataset",
  BOOKTITLE = Novelty22,
  YEAR = "2022",
  PAGES = "282--291",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202180"
}

@inproceedings{bb207069,
  AUTHOR = "Nishimura, T. and Sakoda, K. and Hashimoto, A. and Ushiku, Y. and Tanaka, N. and Ono, F. and Kameko, H. and Mori, S.",
  TITLE = "Egocentric Biochemical Video-and-Language Dataset",
  BOOKTITLE = CLVL21,
  YEAR = "2021",
  PAGES = "3122--3126",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202181"
}

@inproceedings{bb207070,
  AUTHOR = "Zhang, M. and Maidment, T. and Diab, A. and Kovashka, A. and Hwa, R.",
  TITLE = "Domain-robust VQA with diverse datasets and methods but no target labels",
  BOOKTITLE = CVPR21,
  YEAR = "2021",
  PAGES = "7042--7052",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202182"
}

@inproceedings{bb207071,
  AUTHOR = "Mathew, M. and Karatzas, D. and Jawahar, C.V.",
  TITLE = "DocVQA: A Dataset for VQA on Document Images",
  BOOKTITLE = WACV21,
  YEAR = "2021",
  PAGES = "2199--2208",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202183"
}

@inproceedings{bb207072,
  AUTHOR = "Patel, D. and Parikh, R. and Shastri, Y.",
  TITLE = "Recent Advances in Video Question Answering: A Review of Datasets and Methods",
  BOOKTITLE = VTIUR20,
  YEAR = "2020",
  PAGES = "339--356",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202184"
}

@inproceedings{bb207073,
  AUTHOR = "Fan, C.",
  TITLE = "EgoVQA: An Egocentric Video Question Answering Benchmark Dataset",
  BOOKTITLE = EPIC19,
  YEAR = "2019",
  PAGES = "4359--4366",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202185"
}

@inproceedings{bb207074,
  AUTHOR = "Hudson, D.A. and Manning, C.D.",
  TITLE = "GQA: A New Dataset for Real-World Visual Reasoning and Compositional Question Answering",
  BOOKTITLE = CVPR19,
  YEAR = "2019",
  PAGES = "6693--6702",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202186"
}

@inproceedings{bb207075,
  AUTHOR = "Yang, G.Y.R. and Ganichev, I. and Wang, X.J. and Shlens, J. and Sussillo, D.",
  TITLE = "A Dataset and Architecture for Visual Reasoning with a Working Memory",
  BOOKTITLE = ECCV18,
  YEAR = "2018",
  PAGES = "X:729--745",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202187"
}

@inproceedings{bb207076,
  AUTHOR = "Gan, C. and Li, Y. and Li, H. and Sun, C. and Gong, B.",
  TITLE = "VQS: Linking Segmentations to Questions and Answers for Supervised Attention in VQA and Question-Focused Semantic Segmentation",
  BOOKTITLE = ICCV17,
  YEAR = "2017",
  PAGES = "1829--1838",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202188"
}

@inproceedings{bb207077,
  AUTHOR = "Maharaj, T. and Ballas, N. and Rohrbach, A. and Courville, A. and Pal, C.",
  TITLE = "A Dataset and Exploration of Models for Understanding Video Data through Fill-in-the-Blank Question-Answering",
  BOOKTITLE = CVPR17,
  YEAR = "2017",
  PAGES = "7359--7368",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT202189"
}

% ---- Entries whose BIBSOURCE page is applicat803vdi3.html ----

@article{bb207078,
  AUTHOR = "Das, A. and Kottur, S. and Gupta, K. and Singh, A. and Yadav, D. and Lee, S. and Moura, J.M.F. and Parikh, D. and Batra, D.",
  TITLE = "Visual Dialog",
  JOURNAL = PAMI,
  VOLUME = "41",
  YEAR = "2019",
  NUMBER = "5",
  MONTH = may,
  PAGES = "1242--1256",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202190"
}

@article{bb207079,
  AUTHOR = "Zhao, Z. and Zhang, Z. and Jiang, X.H. and Cai, D.",
  TITLE = "Multi-Turn Video Question Answering via Hierarchical Attention Context Reinforced Networks",
  JOURNAL = IP,
  VOLUME = "28",
  YEAR = "2019",
  NUMBER = "8",
  MONTH = aug,
  PAGES = "3860--3872",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202191"
}

@article{bb207080,
  AUTHOR = "Gu, M. and Zhao, Z. and Jin, W. and Cai, D. and Wu, F.",
  TITLE = "Video Dialog via Multi-Grained Convolutional Self-Attention Context Multi-Modal Networks",
  JOURNAL = CirSysVideo,
  VOLUME = "30",
  YEAR = "2020",
  NUMBER = "12",
  MONTH = dec,
  PAGES = "4453--4466",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202192"
}

@article{bb207081,
  AUTHOR = "Guo, D. and Wang, H. and Wang, S. and Wang, M.",
  TITLE = "Textual-Visual Reference-Aware Attention Network for Visual Dialog",
  JOURNAL = IP,
  VOLUME = "29",
  YEAR = "2020",
  PAGES = "6655--6666",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202193"
}

@article{bb207082,
  AUTHOR = "Patro, B.N. and Anupriy and Namboodiri, V.P.",
  TITLE = "Probabilistic framework for solving visual dialog",
  JOURNAL = PR,
  VOLUME = "110",
  YEAR = "2021",
  PAGES = "107586",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202194"
}

@article{bb207083,
  AUTHOR = "Zhao, L. and Lyu, X.Y. and Song, J.K. and Gao, L.L.",
  TITLE = "GuessWhich? Visual dialog with attentive memory network",
  JOURNAL = PR,
  VOLUME = "114",
  YEAR = "2021",
  PAGES = "107823",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202195"
}

@article{bb207084,
  AUTHOR = "Jiang, T.L. and Shao, H.L. and Tian, X. and Ji, Y. and Liu, C.P.",
  TITLE = "Aligning vision-language for graph inference in visual dialog",
  JOURNAL = IVC,
  VOLUME = "116",
  YEAR = "2021",
  PAGES = "104316",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202196"
}

@article{bb207085,
  AUTHOR = "Guo, D. and Wang, H. and Wang, M.",
  TITLE = "Context-Aware Graph Inference With Knowledge Distillation for Visual Dialog",
  JOURNAL = PAMI,
  VOLUME = "44",
  YEAR = "2022",
  NUMBER = "10",
  MONTH = oct,
  PAGES = "6056--6073",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202197"
}

@inproceedings{bb207086,
  AUTHOR = "Guo, D. and Wang, H. and Zhang, H.W. and Zha, Z.J. and Wang, M.",
  TITLE = "Iterative Context-Aware Graph Inference for Visual Dialog",
  BOOKTITLE = CVPR20,
  YEAR = "2020",
  PAGES = "10052--10061",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202198"
}

@article{bb207087,
  AUTHOR = "Patro, B.N. and Anupriy and Namboodiri, V.P.",
  TITLE = "Explanation vs. attention: A two-player game to obtain attention for VQA and visual dialog",
  JOURNAL = PR,
  VOLUME = "132",
  YEAR = "2022",
  PAGES = "108898",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202199"
}

@article{bb207088,
  AUTHOR = "Zhu, Y. and Wu, Y. and Yang, Y. and Yan, Y.",
  TITLE = "Saying the Unseen: Video Descriptions via Dialog Agents",
  JOURNAL = PAMI,
  VOLUME = "44",
  YEAR = "2022",
  NUMBER = "10",
  MONTH = oct,
  PAGES = "7190--7204",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202200"
}

@article{bb207089,
  AUTHOR = "Huang, Y. and Wang, Y.M. and Wang, L.",
  TITLE = "Efficient Image and Sentence Matching",
  JOURNAL = PAMI,
  VOLUME = "45",
  YEAR = "2023",
  NUMBER = "3",
  MONTH = mar,
  PAGES = "2970--2983",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202201"
}

@article{bb207090,
  AUTHOR = "Zhao, L. and Li, J.L. and Gao, L.L. and Rao, Y. and Song, J.K. and Shen, H.T.",
  TITLE = "Heterogeneous Knowledge Network for Visual Dialog",
  JOURNAL = CirSysVideo,
  VOLUME = "33",
  YEAR = "2023",
  NUMBER = "2",
  MONTH = feb,
  PAGES = "861--871",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202202"
}

@article{bb207091,
  AUTHOR = "Bucinca, Z. and Yemez, Y. and Erzin, E. and Sezgin, M.",
  TITLE = "AffectON: Incorporating Affect Into Dialog Generation",
  JOURNAL = AffCom,
  VOLUME = "14",
  YEAR = "2023",
  NUMBER = "1",
  MONTH = jan,
  PAGES = "823--835",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202203"
}

@article{bb207092,
  AUTHOR = "Yu, H. and Ko, Y.J.",
  TITLE = "Enriching the dialogue state tracking model with a syntactic discourse graph",
  JOURNAL = PRL,
  VOLUME = "169",
  YEAR = "2023",
  PAGES = "81--86",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202204"
}

@article{bb207093,
  AUTHOR = "Wu, Y.X. and Liao, L. and Zhang, G.Y. and Lei, W.Q. and Zhao, G.S. and Qian, X.M. and Chua, T.S.",
  TITLE = "State Graph Reasoning for Multimodal Conversational Recommendation",
  JOURNAL = MultMed,
  VOLUME = "25",
  YEAR = "2023",
  PAGES = "3113--3124",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202205"
}

@article{bb207094,
  AUTHOR = "Firdaus, M. and Thangavelu, N. and Ekbal, A. and Bhattacharyya, P.",
  TITLE = "I Enjoy Writing and Playing, Do You?: A Personalized and Emotion Grounded Dialogue Agent Using Generative Adversarial Network",
  JOURNAL = AffCom,
  VOLUME = "14",
  YEAR = "2023",
  NUMBER = "3",
  MONTH = jul,
  PAGES = "2127--2138",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202206"
}

@inproceedings{bb207095,
  AUTHOR = "Madasu, A. and Lal, V.",
  TITLE = "Is Multimodal Vision Supervision Beneficial to Language?",
  BOOKTITLE = NFVLR23,
  YEAR = "2023",
  PAGES = "2637--2642",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202207"
}

@inproceedings{bb207096,
  AUTHOR = "Ashutosh, K. and Girdhar, R. and Torresani, L. and Grauman, K.",
  TITLE = "HierVL: Learning Hierarchical Video-Language Embeddings",
  BOOKTITLE = CVPR23,
  YEAR = "2023",
  PAGES = "23066--23078",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202208"
}

@inproceedings{bb207097,
  AUTHOR = "Smith, J.S. and Cascante Bonilla, P. and Arbelle, A. and Kim, D.H. and Panda, R. and Cox, D. and Yang, D. and Kira, Z. and Feris, R. and Karlinsky, L.",
  TITLE = "ConStruct-VL: Data-Free Continual Structured VL Concepts Learning",
  BOOKTITLE = CVPR23,
  YEAR = "2023",
  PAGES = "14994--15004",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202209"
}

@inproceedings{bb207098,
  AUTHOR = "Chen, Y.X. and Ma, Z.Y. and Zhang, Z.Q. and Qi, Z.G. and Yuan, C.F. and Shan, Y. and Li, B. and Hu, W.M. and Qie, X. and Wu, J.P.",
  TITLE = "ViLEM: Visual-Language Error Modeling for Image-Text Retrieval",
  BOOKTITLE = CVPR23,
  YEAR = "2023",
  PAGES = "11018--11027",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202210"
}

@inproceedings{bb207099,
  AUTHOR = "Huang, J.J. and Li, Y. and Feng, J.S. and Wu, X.L. and Sun, X.S. and Ji, R.R.",
  TITLE = "Clover: Towards A Unified Video-Language Alignment and Fusion Model",
  BOOKTITLE = CVPR23,
  YEAR = "2023",
  PAGES = "14856--14866",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT202211"
}