@inproceedings{bb213000,
        AUTHOR = "Bolanos, M. and Peris, A. and Casacuberta, F. and Radeva, P.",
        TITLE = "{VIBIKNet}: Visual Bidirectional Kernelized Network for Visual Question Answering",
        BOOKTITLE = IbPRIA17,
        YEAR = "2017",
        PAGES = "372--380",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT208076"}

@inproceedings{bb213001,
        AUTHOR = "Kafle, K. and Kanan, C.",
        TITLE = "An Analysis of Visual Question Answering Algorithms",
        BOOKTITLE = ICCV17,
        YEAR = "2017",
        PAGES = "1983--1991",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT208077"}

@inproceedings{bb213002,
        AUTHOR = "Kafle, K. and Kanan, C.",
        TITLE = "Answer-Type Prediction for Visual Question Answering",
        BOOKTITLE = CVPR16,
        YEAR = "2016",
        PAGES = "4976--4984",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT208078"}

@inproceedings{bb213003,
        AUTHOR = "Wang, P. and Wu, Q. and Shen, C. and van den Hengel, A. J.",
        TITLE = "The {VQA-Machine}: Learning How to Use Existing Vision Algorithms to Answer New Questions",
        BOOKTITLE = CVPR17,
        YEAR = "2017",
        PAGES = "3909--3918",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT208079"}

@inproceedings{bb213004,
        AUTHOR = "Yu, D. and Fu, J. and Mei, T. and Rui, Y.",
        TITLE = "Multi-level Attention Networks for Visual Question Answering",
        BOOKTITLE = CVPR17,
        YEAR = "2017",
        PAGES = "4187--4195",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT208080"}

@inproceedings{bb213005,
        AUTHOR = "Ramakrishnan, S. K. and Pal, A. and Sharma, G. and Mittal, A.",
        TITLE = "An Empirical Evaluation of Visual Question Answering for Novel Objects",
        BOOKTITLE = CVPR17,
        YEAR = "2017",
        PAGES = "7312--7321",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT208081"}

@article{bb213006,
        AUTHOR = "Tamaazousti, Y. and Le Borgne, H. and Popescu, A. and Gadeski, E. and Ginsca, A. and Hudelot, C.",
        TITLE = "Vision-language integration using constrained local semantic features",
        JOURNAL = CVIU,
        VOLUME = "163",
        YEAR = "2017",
        NUMBER = "1",
        PAGES = "41--57",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208082"}

% NOTE(review): PAGES holds a single number rather than a range; presumably an article number -- confirm.
@article{bb213007,
        AUTHOR = "Gouthaman, K. V. and Nambiar, A. and Srinivas, K. S. and Mittal, A.",
        TITLE = "Linguistically-aware attention for reducing the semantic gap in vision-language tasks",
        JOURNAL = PR,
        VOLUME = "112",
        YEAR = "2021",
        PAGES = "107812",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208083"}

@article{bb213008,
        AUTHOR = "Zhou, K. Y. and Yang, J. K. and Loy, C. C. and Liu, Z. W.",
        TITLE = "Learning to Prompt for Vision-Language Models",
        JOURNAL = IJCV,
        VOLUME = "130",
        YEAR = "2022",
        NUMBER = "9",
        MONTH = sep,
        PAGES = "2337--2348",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208084"}

@inproceedings{bb213009,
        AUTHOR = "Zhou, K. Y. and Yang, J. K. and Loy, C. C. and Liu, Z. W.",
        TITLE = "Conditional Prompt Learning for Vision-Language Models",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "16795--16804",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208085"}

@article{bb213010,
        AUTHOR = "Ma, C. C. and Liu, Y. and Deng, J. K. and Xie, L. X. and Dong, W. M. and Xu, C. S.",
        TITLE = "Understanding and Mitigating Overfitting in Prompt Tuning for Vision-Language Models",
        JOURNAL = CirSysVideo,
        VOLUME = "33",
        YEAR = "2023",
        NUMBER = "9",
        MONTH = sep,
        PAGES = "4616--4629",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208086"}

@article{bb213011,
        AUTHOR = "Zhu, Y. Q. and Li, X. Y. and Zheng, M. and Yang, J. H. and Wang, Z. and Guo, X. Q. and Chai, Z. F. and Yuan, Y. C. and Jiang, S. Q.",
        TITLE = "Focus and Align: Learning Tube Tokens for Video-Language Pre-Training",
        JOURNAL = MultMed,
        VOLUME = "25",
        YEAR = "2023",
        PAGES = "8036--8050",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208087"}

% NOTE(review): PAGES holds a single number rather than a range; presumably an article number -- confirm.
@article{bb213012,
        AUTHOR = "Chen, C. Q. and Han, D. and Chang, C. C.",
        TITLE = "{MPCCT}: Multimodal vision-language learning paradigm with context-based compact {Transformer}",
        JOURNAL = PR,
        VOLUME = "147",
        YEAR = "2024",
        PAGES = "110084",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208088"}

@article{bb213013,
        AUTHOR = "Wu, W. H. and Sun, Z. and Song, Y. X. and Wang, J. D. and Ouyang, W. L.",
        TITLE = "Transferring Vision-Language Models for Visual Recognition: A Classifier Perspective",
        JOURNAL = IJCV,
        VOLUME = "132",
        YEAR = "2024",
        NUMBER = "2",
        MONTH = feb,
        PAGES = "392--409",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208089"}

@article{bb213014,
        AUTHOR = "Ming, Y. F. and Li, Y. X.",
        TITLE = "How Does Fine-Tuning Impact Out-of-Distribution Detection for Vision-Language Models?",
        JOURNAL = IJCV,
        VOLUME = "132",
        YEAR = "2024",
        NUMBER = "2",
        MONTH = feb,
        PAGES = "596--609",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208090"}

@article{bb213015,
        AUTHOR = "Zhao, C. R. and Wang, Y. and Jiang, X. Y. and Shen, Y. F. and Song, K. and Li, D. S. and Miao, D. Q.",
        TITLE = "Learning Domain Invariant Prompt for Vision-Language Models",
        JOURNAL = IP,
        VOLUME = "33",
        YEAR = "2024",
        PAGES = "1348--1360",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208091"}

@article{bb213016,
        AUTHOR = "Yang, X. F. and Liu, F. and Lin, G. S.",
        TITLE = "Neural Logic Vision Language Explainer",
        JOURNAL = MultMed,
        VOLUME = "26",
        YEAR = "2024",
        PAGES = "3331--3340",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208092"}

@article{bb213017,
        AUTHOR = "Wang, Y. D. and Yu, Z. O. and Wang, J. D. and Heng, Q. and Chen, H. and Ye, W. and Xie, R. and Xie, X. and Zhang, S. K.",
        TITLE = "Exploring Vision-Language Models for Imbalanced Learning",
        JOURNAL = IJCV,
        VOLUME = "132",
        YEAR = "2024",
        NUMBER = "1",
        MONTH = jan,
        PAGES = "224--237",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208093"}

@article{bb213018,
        AUTHOR = "Yu, Z. T. and Zhao, J. and Guo, C. L. and Yang, Y.",
        TITLE = "{StableNet}: Distinguishing the hard samples to overcome language priors in visual question answering",
        JOURNAL = IET-CV,
        VOLUME = "18",
        YEAR = "2024",
        NUMBER = "2",
        PAGES = "315--327",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208094"}

% NOTE(review): "X2" may stand for a superscripted X^2 in the published title; confirm before changing the text.
@article{bb213019,
        AUTHOR = "Zeng, Y. and Zhang, X. and Li, H. and Wang, J. W. and Zhang, J. P. and Zhou, W.",
        TITLE = "{X2-VLM}: All-in-One Pre-Trained Model for Vision-Language Tasks",
        JOURNAL = PAMI,
        VOLUME = "46",
        YEAR = "2024",
        NUMBER = "5",
        MONTH = may,
        PAGES = "3156--3168",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208095"}

@article{bb213020,
        AUTHOR = "Zheng, Y. Z. and Zhong, B. and Liang, Q. H. and Li, G. R. and Ji, R. R. and Li, X. X.",
        TITLE = "Toward Unified Token Learning for Vision-Language Tracking",
        JOURNAL = CirSysVideo,
        VOLUME = "34",
        YEAR = "2024",
        NUMBER = "4",
        MONTH = apr,
        PAGES = "2125--2135",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208096"}

% Journal article record; the journal name is supplied by the RS string macro.
% NOTE(review): pages is a single number rather than a range; presumably an article number -- confirm.
@article{bb213021,
  author    = {Ye, P. and Xiao, G. and Liu, J.},
  title     = {Multimodal Features Alignment for Vision-Language Object Tracking},
  journal   = RS,
  volume    = {16},
  number    = {7},
  year      = {2024},
  pages     = {1168},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208097}
}

@inproceedings{bb213022,
        AUTHOR = "Ganz, R. and Nuriel, O. and Aberdam, A. and Kittenplon, Y. and Mazor, S. and Litman, R.",
        TITLE = "Towards Models that Can See and Read",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "21661--21671",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208098"}

@inproceedings{bb213023,
        AUTHOR = "Zhang, H. and Liu, D. and Lv, Z. and Su, B. and Tao, D. C.",
        TITLE = "Exploring Temporal Concurrency for Video-Language Representation Learning",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "15522--15532",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208099"}

@inproceedings{bb213024,
        AUTHOR = "Shukor, M. and Dancette, C. and Cord, M.",
        TITLE = "{eP-ALM}: Efficient Perceptual Augmentation of Language Models",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "21999--22012",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208100"}

@inproceedings{bb213025,
        AUTHOR = "Schulter, S. and Kumar, B. G. V. and Suh, Y. M. and Dafnis, K. M. and Zhang, Z. X. and Zhao, S. Y. and Metaxas, D. N.",
        TITLE = "{OmniLabel}: A Challenging Benchmark for Language-Based Object Detection",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "11919--11928",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208101"}

@inproceedings{bb213026,
        AUTHOR = "Chen, Z. L. and Huang, X. and Guan, Q. L. and Lin, L. and Luo, W. Q.",
        TITLE = "A Retrospect to Multi-prompt Learning across Vision and Language",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "22133--22144",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208102"}

@inproceedings{bb213027,
        AUTHOR = "Derakhshani, M. M. and Sanchez, E. and Bulat, A. and da Costa, V. G. T. and Snoek, C. G. M. and Tzimiropoulos, G. and Martinez, B.",
        TITLE = "Bayesian Prompt Learning for Image-Language Model Generalization",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "15191--15200",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208103"}

% The ampersand in the title is escaped: a bare one is LaTeX's alignment-tab character and
% aborts typesetting of the bibliography.
@inproceedings{bb213028,
        AUTHOR = "Cascante Bonilla, P. and Shehada, K. and Smith, J. S. and Doveh, S. and Kim, D. H. and Panda, R. and Varol, G. and Oliva, A. and Ordonez, V. and Feris, R. S. and Karlinsky, L.",
        TITLE = "Going Beyond Nouns With Vision \& Language Models Using Synthetic Data",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "20098--20108",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208104"}

@inproceedings{bb213029,
        AUTHOR = "Zara, G. and Conti, A. and Roy, S. and Lathuiliere, S. and Rota, P. and Ricci, E.",
        TITLE = "The Unreasonable Effectiveness of Large Language-Vision Models for Source-free Video Domain Adaptation",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "10273--10283",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208105"}

% NOTE(review): "Vison-Language" is left as recorded; it may mirror the spelling of the
% published proceedings title -- confirm against the publisher before correcting.
@inproceedings{bb213030,
        AUTHOR = "Upadhyay, U. and Karthik, S. and Mancini, M. and Akata, Z.",
        TITLE = "{ProbVLM}: Probabilistic Adapter for Frozen Vison-Language Models",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "1899--1910",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208106"}

@inproceedings{bb213031,
        AUTHOR = "Chen, Z. H. and Diao, S. Z. and Wang, B. and Li, G. B. and Wan, X.",
        TITLE = "Towards Unifying Medical Vision-and-Language Pre-training via Soft Prompts",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "23346--23356",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208107"}

@inproceedings{bb213032,
        AUTHOR = "Bitton Guetta, N. and Bitton, Y. and Hessel, J. and Schmidt, L. and Elovici, Y. and Stanovsky, G. and Schwartz, R.",
        TITLE = "Breaking Common Sense: {WHOOPS}! A Vision-and-Language Benchmark of Synthetic and Compositional Images",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "2616--2627",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208108"}

@inproceedings{bb213033,
        AUTHOR = "Hu, Z. Y. and Li, Y. and Lyu, M. R. and Wang, L. W.",
        TITLE = "{VL-PET}: Vision-and-Language Parameter-Efficient Tuning via Granularity Control",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "2998--3008",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208109"}

@inproceedings{bb213034,
        AUTHOR = "Slyman, E. and Kahng, M. and Lee, S.",
        TITLE = "{VLSlice}: Interactive Vision-and-Language Slice Discovery",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "15245--15255",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208110"}

@inproceedings{bb213035,
        AUTHOR = "Najibi, M. and Ji, J. W. and Zhou, Y. and Qi, C. R. and Yan, X. C. and Ettinger, S. and Anguelov, D.",
        TITLE = "Unsupervised {3D} Perception with {2D} Vision-Language Distillation for Autonomous Driving",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "8568--8578",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208111"}

@inproceedings{bb213036,
        AUTHOR = "Zheng, K. and Wu, W. and Feng, R. and Zhu, K. and Liu, J. W. and Zhao, D. L. and Zha, Z. J. and Chen, W. and Shen, Y. J.",
        TITLE = "Regularized Mask Tuning: Uncovering Hidden Knowledge in Pre-trained Vision-Language Models",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "11629--11639",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208112"}

@inproceedings{bb213037,
        AUTHOR = "Wang, T. and Lin, K. and Li, L. J. and Lin, C. C. and Yang, Z. Y. and Zhang, H. W. and Liu, Z. C. and Wang, L. J.",
        TITLE = "Equivariant Similarity for Vision-Language Foundation Models",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "11964--11974",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208113"}

@inproceedings{bb213038,
        AUTHOR = "Xu, H. and Xie, S. and Huang, P. Y. and Yu, L. C. and Howes, R. and Ghosh, G. and Zettlemoyer, L. and Feichtenhofer, C.",
        TITLE = "{CiT}: Curation in Training for Effective Vision-Language Data",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "15134--15143",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208114"}

@inproceedings{bb213039,
        AUTHOR = "Trager, M. and Perera, P. and Zancato, L. and Achille, A. and Bhatia, P. and Soatto, S.",
        TITLE = "Linear Spaces of Meanings: Compositional Structures in Vision-Language Models",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "15349--15358",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208115"}

@inproceedings{bb213040,
        AUTHOR = "Chen, Y. S. and Song, Y. Z. and Yeo, C. Y. and Liu, B. and Fu, J. L. and Shuai, H. H.",
        TITLE = "{SINC}: Self-Supervised In-Context Learning for Vision-Language Tasks",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "15384--15396",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208116"}

@inproceedings{bb213041,
        AUTHOR = "Wu, C. E. and Tian, Y. and Yu, H. C. and Wang, H. and Morgado, P. and Hu, Y. H. and Yang, L. J.",
        TITLE = "Why Is Prompt Tuning for Vision-Language Models Robust to Noisy Labels?",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "15442--15451",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208117"}

% The third author is spelled "Martinez", matching the "Martinez, B." who appears alongside
% Bulat and Tzimiropoulos in entry bb213027 of this file.
@inproceedings{bb213042,
        AUTHOR = "Ouali, Y. and Bulat, A. and Martinez, B. and Tzimiropoulos, G.",
        TITLE = "Black Box Few-Shot Adaptation for Vision-Language models",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "15488--15500",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208118"}

@inproceedings{bb213043,
        AUTHOR = "Kan, B. and Wang, T. and Lu, W. P. and Zhen, X. T. and Guan, W. and Zheng, F.",
        TITLE = "Knowledge-Aware Prompt Tuning for Generalizable Vision-Language Models",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "15624--15634",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208119"}

@inproceedings{bb213044,
        AUTHOR = "Zhai, J. T. and Zhang, Q. and Wu, T. and Chen, X. Y. and Liu, J. J. and Cheng, M. M.",
        TITLE = "{SLAN}: Self-Locator Aided Network for Vision-Language Understanding",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "21892--21901",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208120"}

@inproceedings{bb213045,
        AUTHOR = "Long, S. and Zhao, Z. and Yuan, J. and Tan, Z. C. and Liu, J. J. and Zhou, L. P. and Wang, S. S. and Wang, J. D.",
        TITLE = "Task-Oriented Multi-Modal Mutual Learning for Vision-Language Models",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "21902--21912",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208121"}

@inproceedings{bb213046,
        AUTHOR = "Cho, E. and Kim, J. and Kim, H. W. J.",
        TITLE = "Distribution-Aware Prompt Tuning for Vision-Language Models",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "21947--21956",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208122"}

@inproceedings{bb213047,
        AUTHOR = "Varma, M. and Delbrouck, J. B. and Hooper, S. and Chaudhari, A. and Langlotz, C.",
        TITLE = "{ViLLA}: Fine-Grained Vision-Language Representation Learning from Real-World Data",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "22168--22178",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208123"}

@inproceedings{bb213048,
        AUTHOR = "Zhu, H. G. and Wei, Y. C. and Liang, X. D. and Zhang, C. J. and Zhao, Y.",
        TITLE = "{CTP}: Towards Vision-Language Continual Pretraining via Compatible Momentum Contrast and Topology Preservation",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "22200--22210",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208124"}

@inproceedings{bb213049,
        AUTHOR = "Salin, E. and Ayache, S. and Favre, B.",
        TITLE = "Towards an Exhaustive Evaluation of Vision-Language Foundation Models",
        BOOKTITLE = MMFM23,
        YEAR = "2023",
        PAGES = "339--352",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208125"}

@inproceedings{bb213050,
        AUTHOR = "Hu, Z. and Zhu, X. L. and Tran, S. and Vidal, R. and Dhua, A.",
        TITLE = "{ProVLA}: Compositional Image Search with Progressive Vision-Language Alignment and Multimodal Fusion",
        BOOKTITLE = CLVL23,
        YEAR = "2023",
        PAGES = "2764--2769",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208126"}

@inproceedings{bb213051,
        AUTHOR = "Hall, M. and Gustafson, L. and Adcock, A. and Misra, I. and Ross, C.",
        TITLE = "Vision-Language Models Performing Zero-Shot Tasks Exhibit Disparities Between Gender Groups",
        BOOKTITLE = CLVL23,
        YEAR = "2023",
        PAGES = "2770--2777",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208127"}

@inproceedings{bb213052,
        AUTHOR = "Agnolucci, L. and Baldrati, A. and Todino, F. and Becattini, F. and Bertini, M. and del Bimbo, A.",
        TITLE = "{ECO}: Ensembling Context Optimization for Vision-Language Models",
        BOOKTITLE = CLVL23,
        YEAR = "2023",
        PAGES = "2803--2807",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208128"}

@inproceedings{bb213053,
        AUTHOR = "Palit, V. and Pandey, R. and Arora, A. and Liang, P. P.",
        TITLE = "Towards Vision-Language Mechanistic Interpretability: A Causal Tracing Tool for {BLIP}",
        BOOKTITLE = CLVL23,
        YEAR = "2023",
        PAGES = "2848--2853",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208129"}

@inproceedings{bb213054,
        AUTHOR = "Sammani, F. and Deligiannis, N.",
        TITLE = "{Uni-NLX}: Unifying Textual Explanations for Vision and Vision-Language Tasks",
        BOOKTITLE = VLAR23,
        YEAR = "2023",
        PAGES = "4636--4641",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208130"}

@inproceedings{bb213055,
        AUTHOR = "Lu, D. and Wang, Z. Q. and Wang, T. and Guan, W. and Gao, H. and Zheng, F.",
        TITLE = "Set-level Guidance Attack: Boosting Adversarial Transferability of Vision-Language Pre-training Models",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "102--111",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208131"}

@inproceedings{bb213056,
        AUTHOR = "Lee, D. J. and Song, S. and Suh, J. and Choi, J. and Lee, S. and Kim, H. W. J.",
        TITLE = "Read-only Prompt Optimization for Vision-Language Few-shot Learning",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "1401--1411",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208132"}

@inproceedings{bb213057,
        AUTHOR = "Li, X. and Fang, Y. H. and Liu, M. H. and Ling, Z. and Tu, Z. W. and Su, H.",
        TITLE = "Distilling Large Vision-Language Model with Out-of-Distribution Generalizability",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "2492--2503",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208133"}

@inproceedings{bb213058,
        AUTHOR = "Li, J. C. and Gao, M. and Wei, L. and Tang, S. L. and Zhang, W. Q. and Li, M. and Ji, W. and Tian, Q. and Chua, T. S. and Zhuang, Y. T.",
        TITLE = "Gradient-Regulated Meta-Prompt Learning for Generalizable Vision-Language Models",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "2551--2562",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208134"}

@inproceedings{bb213059,
        AUTHOR = "Bi, J. Y. and Cheng, D. and Yao, P. and Pang, B. and Zhan, Y. F. and Yang, C. G. and Wang, Y. J. and Sun, H. and Deng, W. W. and Zhang, Q.",
        TITLE = "{VL-Match}: Enhancing Vision-Language Pretraining with Token-Level and Instance-Level Matching",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "2584--2593",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208135"}

@inproceedings{bb213060,
        AUTHOR = "Udandarao, V. and Gupta, A. and Albanie, S.",
        TITLE = "{SuS-X}: Training-Free Name-Only Transfer of Vision-Language Models",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "2725--2736",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208136"}

@inproceedings{bb213061,
        AUTHOR = "Jiang, C. and Xu, H. Y. and Ye, W. and Ye, Q. H. and Li, C. L. and Yan, M. and Bi, B. and Zhang, S. K. and Huang, F. and Huang, S.",
        TITLE = "{BUS}: Efficient and Effective Vision-language Pre-training with Bottom-Up Patch Summarization",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "2888--2898",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208137"}

@inproceedings{bb213062,
        AUTHOR = "Shi, C. and Yang, S.",
        TITLE = "{LoGoPrompt}: Synthetic Text Images Can Be Good Visual Prompts for Vision-Language Models",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "2920--2929",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208138"}

@inproceedings{bb213063,
        AUTHOR = "Wang, A. J. P. and Lin, K. Q. and Zhang, D. J. H. and Lei, S. W. X. and Shou, M. Z.",
        TITLE = "Too Large; Data Reduction for Vision-Language Pre-Training",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "3124--3134",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208139"}

@inproceedings{bb213064,
        AUTHOR = "Wang, W. H. and Yang, Z. and Xu, B. and Li, J. and Sun, Y.",
        TITLE = "{ViLTA}: Enhancing Vision-Language Pre-training through Textual Augmentation",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "3135--3146",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208140"}

@inproceedings{bb213065,
        AUTHOR = "Wang, T. J. J. and Laaksonen, J. and Langer, T. and Arponen, H. and Bishop, T. E.",
        TITLE = "Learning by Hallucinating: Vision-Language Pre-training with Weak Supervision",
        BOOKTITLE = WACV23,
        YEAR = "2023",
        PAGES = "1073--1083",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208141"}

% NOTE(review): the "XXXVI:" prefix in PAGES presumably identifies the proceedings part/volume; confirm.
@inproceedings{bb213066,
        AUTHOR = "Boecking, B. and Usuyama, N. and Bannur, S. and Castro, D. C. and Schwaighofer, A. and Hyland, S. and Wetscherek, M. and Naumann, T. and Nori, A. and Alvarez Valle, J. and Poon, H. and Oktay, O.",
        TITLE = "Making the Most of Text Semantics to Improve Biomedical Vision-Language Processing",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVI:1--21",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208142"}

% NOTE(review): the "XXXVI:" prefix in PAGES presumably identifies the proceedings part/volume; confirm.
@inproceedings{bb213067,
        AUTHOR = "Cui, Q. and Zhou, B. and Guo, Y. and Yin, W. D. and Wu, H. and Yoshie, O. and Chen, Y.",
        TITLE = "Contrastive Vision-Language Pre-training with Limited Resources",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVI:236--253",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208143"}

@inproceedings{bb213068,
        AUTHOR = "Walmer, M. and Sikka, K. and Sur, I. and Shrivastava, A. and Jha, S.",
        TITLE = "Dual-Key Multimodal Backdoors for Visual Question Answering",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "15354--15364",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208144"}

@inproceedings{bb213069,
        AUTHOR = "Ding, Y. and Yu, J. and Liu, B. and Hu, Y. and Cui, M. X. and Wu, Q.",
        TITLE = "{MuKEA}: Multimodal Knowledge Extraction and Accumulation for Knowledge-based Visual Question Answering",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "5079--5088",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208145"}

@inproceedings{bb213070,
        AUTHOR = "Gao, F. and Ping, Q. and Thattai, G. and Reganti, A. and Wu, Y. N. and Natarajan, P.",
        TITLE = "Transform-Retrieve-Generate: Natural Language-Centric Outside-Knowledge Visual Question Answering",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "5057--5067",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208146"}

@inproceedings{bb213071,
        AUTHOR = "Aflalo, E. and Du, M. and Tseng, S. Y. and Liu, Y. F. and Wu, C. and Duan, N. and Lal, V.",
        TITLE = "{VL-InterpreT}: An Interactive Visualization Tool for Interpreting Vision-Language Transformers",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "21374--21383",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208147"}

@inproceedings{bb213072,
        AUTHOR = "Hu, X. W. and Gan, Z. and Wang, J. F. and Yang, Z. Y. and Liu, Z. C. and Lu, Y. and Wang, L. J.",
        TITLE = "Scaling Up Vision-Language Pretraining for Image Captioning",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "17959--17968",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208148"}

@inproceedings{bb213073,
        AUTHOR = "Zhang, P. C. and Li, X. J. and Hu, X. W. and Yang, J. W. and Zhang, L. and Wang, L. J. and Choi, Y. J. and Gao, J. F.",
        TITLE = "{VinVL}: Revisiting Visual Representations in Vision-Language Models",
        BOOKTITLE = CVPR21,
        YEAR = "2021",
        PAGES = "5575--5584",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208149"}

@inproceedings{bb213074,
        AUTHOR = "Li, Z. W. and Stengel Eskin, E. and Zhang, Y. X. and Xie, C. and Tran, Q. and van Durme, B. and Yuille, A. L.",
        TITLE = "Calibrating Concepts and Operations: Towards Symbolic Reasoning on Real Images",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "14890--14899",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208150"}

@inproceedings{bb213075,
        AUTHOR = "Yang, X. and Zhang, H. W. and Qi, G. J. and Cai, J. F.",
        TITLE = "Causal Attention for Vision-Language Tasks",
        BOOKTITLE = CVPR21,
        YEAR = "2021",
        PAGES = "9842--9852",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208151"}

@inproceedings{bb213076,
        AUTHOR = "Stefanini, M. and Cornia, M. and Baraldi, L. and Cucchiara, R.",
        TITLE = "A Novel Attention-based Aggregation Function to Combine Vision and Language",
        BOOKTITLE = ICPR21,
        YEAR = "2021",
        PAGES = "1212--1219",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208152"}

@inproceedings{bb213077,
        AUTHOR = "Jain, V. and Lodhavia, J.",
        TITLE = "Automatic Question Tagging using k-Nearest Neighbors and Random Forest",
        BOOKTITLE = ISCV20,
        YEAR = "2020",
        PAGES = "1--4",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208153"}

@inproceedings{bb213078,
        AUTHOR = "Zheng, W. B. and Yan, L. and Gou, C. and Wang, F. Y.",
        TITLE = "Webly Supervised Knowledge Embedding Model for Visual Reasoning",
        BOOKTITLE = CVPR20,
        YEAR = "2020",
        PAGES = "12442--12451",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208154"}

@inproceedings{bb213079,
        AUTHOR = "Nguyen, D. K. and Okatani, T.",
        TITLE = "Multi-Task Learning of Hierarchical Vision-Language Representation",
        BOOKTITLE = CVPR19,
        YEAR = "2019",
        PAGES = "10484--10493",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208155"}

@inproceedings{bb213080,
        AUTHOR = "Gupta, T. and Shih, K. J. and Singh, S. and Hoiem, D.",
        TITLE = "Aligned Image-Word Representations Improve Inductive Transfer Across Vision-Language Tasks",
        BOOKTITLE = ICCV17,
        YEAR = "2017",
        PAGES = "4223--4232",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT208156"}

@article{bb213081,
  author    = {Wu, Y.C. and Yang, J.C.},
  title     = {A Robust Passage Retrieval Algorithm for Video Question Answering},
  journal   = CirSysVideo,
  volume    = {18},
  number    = {10},
  pages     = {1411-1421},
  month     = {October},
  year      = {2008},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208157}
}

@inproceedings{bb213082,
  author    = {Wu, Y.C. and Lee, Y.S. and Yang, J.C. and Yen, S.J.},
  title     = {A New Passage Ranking Algorithm for Video Question Answering},
  booktitle = PSIVT06,
  pages     = {563-572},
  year      = {2006},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208158}
}

@article{bb213083,
  author    = {Li, G.D. and Li, H.J. and Ming, Z.Y. and Hong, R.C. and Tang, S. and Chua, T.S.},
  title     = {Question Answering over Community-Contributed Web Videos},
  journal   = MultMedMag,
  volume    = {17},
  number    = {4},
  pages     = {46-57},
  month     = {October},
  year      = {2010},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208159}
}

@inproceedings{bb213084,
  author    = {Song, Y.C. and Li, H.J.},
  title     = {Mash-Up Approach for Web Video Category Recommendation},
  booktitle = PSIVT10,
  pages     = {197-202},
  year      = {2010},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208160}
}

@article{bb213085,
  author    = {Guo, Z.Y. and Zhao, Z. and Jin, W. and Wei, Z.C. and Yang, M. and Wang, N.N. and Yuan, N.J.},
  title     = {Multi-Turn Video Question Generation via Reinforced Multi-Choice
Attention Network},
  journal   = CirSysVideo,
  volume    = {31},
  number    = {5},
  pages     = {1697-1710},
  year      = {2021},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208161}
}

@article{bb213086,
  author    = {Xue, H.Y. and Chu, W. and Zhao, Z. and Cai, D.},
  title     = {A Better Way to Attend: Attention With Trees for Video Question
Answering},
  journal   = IP,
  volume    = {27},
  number    = {11},
  pages     = {5563-5574},
  month     = {November},
  year      = {2018},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208162}
}

@article{bb213087,
  author    = {Xue, H.Y. and Zhao, Z. and Cai, D.},
  title     = {Unifying the Video and Question Attentions for Open-Ended Video
Question Answering},
  journal   = IP,
  volume    = {26},
  number    = {12},
  pages     = {5656-5666},
  month     = {December},
  year      = {2017},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208163}
}

@article{bb213088,
  author    = {Zhao, Z. and Xiao, S.W. and Song, Z. and Lu, C.J. and Xiao, J. and Zhuang, Y.T.},
  title     = {Open-Ended Video Question Answering via Multi-Modal Conditional
Adversarial Networks},
  journal   = IP,
  volume    = {29},
  pages     = {3859-3870},
  year      = {2020},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208164}
}

@article{bb213089,
  author    = {Zhao, Z. and Zhang, Z. and Xiao, S.W. and Xiao, Z.X. and Yan, X.H. and Yu, J. and Cai, D. and Wu, F.},
  title     = {Long-Form Video Question Answering via Dynamic Hierarchical
Reinforced Networks},
  journal   = IP,
  volume    = {28},
  number    = {12},
  pages     = {5939-5952},
  month     = {December},
  year      = {2019},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208165}
}

@article{bb213090,
  author    = {Yu, T. and Yu, J. and Yu, Z. and Huang, Q.M. and Tian, Q.},
  title     = {Long-Term Video Question Answering via Multimodal Hierarchical Memory
Attentive Networks},
  journal   = CirSysVideo,
  volume    = {31},
  number    = {3},
  pages     = {931-944},
  month     = {March},
  year      = {2021},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208166}
}

@article{bb213091,
  author    = {Jang, Y. and Song, Y. and Kim, C.D. and Yu, Y. and Kim, Y. and Kim, G.},
  title     = {Video Question Answering with Spatio-Temporal Reasoning},
  journal   = IJCV,
  volume    = {127},
  number    = {10},
  pages     = {1385-1412},
  month     = {October},
  year      = {2019},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208167}
}

@inproceedings{bb213092,
  author    = {Jang, Y. and Song, Y. and Yu, Y. and Kim, Y. and Kim, G.},
  title     = {TGIF-QA:
Toward Spatio-Temporal Reasoning in Visual Question Answering},
  booktitle = CVPR17,
  pages     = {1359-1367},
  year      = {2017},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208168}
}

@article{bb213093,
        AUTHOR = "Yu, T. and Yu, J. and Yu, Z. and Tao, D.",
        TITLE = "Compositional Attention Networks With Two-Stream Fusion for Video
Question Answering",
        JOURNAL = IP,
        VOLUME = "29",
        YEAR = "2020",
        PAGES = "1204-1218",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208169"}

@article{bb213094,
  author    = {Wang, W.N. and Huang, Y. and Wang, L.},
  title     = {Long video question answering: A Matching-guided Attention Model},
  journal   = PR,
  volume    = {102},
  pages     = {107248},
  year      = {2020},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208170}
}

@article{bb213095,
  author    = {Zhang, W. and Tang, S. and Cao, Y. and Pu, S. and Wu, F. and Zhuang, Y.},
  title     = {Frame Augmented Alternating Attention Network for Video Question
Answering},
  journal   = MultMed,
  volume    = {22},
  number    = {4},
  pages     = {1032-1041},
  month     = {April},
  year      = {2020},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208171}
}

@article{bb213096,
  author    = {Chen, J. and Shao, J. and He, C.},
  title     = {Movie fill in the blank by joint learning from video and text with
adaptive temporal attention},
  journal   = PRL,
  volume    = {132},
  pages     = {62-68},
  year      = {2020},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208172}
}

@article{bb213097,
  author    = {Wang, A. and Luu, A.T. and Foo, C. and Zhu, H. and Tay, Y. and Chandrasekhar, V.},
  title     = {Holistic Multi-Modal Memory Network for Movie Question Answering},
  journal   = IP,
  volume    = {29},
  number    = {1},
  pages     = {489-499},
  year      = {2020},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208173}
}

@article{bb213098,
  author    = {Yuan, Z.Q. and Sun, S.Y. and Duan, L.X. and Li, C.S. and Wu, X. and Xu, C.S.},
  title     = {Adversarial Multimodal Network for Movie Story Question Answering},
  journal   = MultMed,
  volume    = {23},
  pages     = {1744-1756},
  year      = {2021},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208174}
}

@article{bb213099,
  author    = {Gu, M. and Zhao, Z. and Jin, W. and Hong, R. and Wu, F.},
  title     = {Graph-Based Multi-Interaction Network for Video Question Answering},
  journal   = IP,
  volume    = {30},
  pages     = {2758-2770},
  year      = {2021},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208175}
}

Last update: Apr 18, 2024 at 11:38:49