@inproceedings{bb229600,
        AUTHOR = "Yao, H.T. and Zhang, R. and Xu, C.S.",
        TITLE = "TCP: Textual-Based Class-Aware Prompt Tuning for Visual-Language
Model",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "23438--23448",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224586"}

@inproceedings{bb229601,
        AUTHOR = "Yang, S. and Tian, Z. and Jiang, L. and Jia, J.Y.",
        TITLE = "Unified Language-Driven Zero-Shot Domain Adaptation",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "23407--23415",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224587"}

@inproceedings{bb229602,
        AUTHOR = "Cui, J.Q. and Zhu, B. and Wen, X. and Qi, X.J. and Yu, B. and Zhang, H.W.",
        TITLE = "Classes Are Not Equal: An Empirical Study on Image Recognition
Fairness",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "23283--23292",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224588"}

@inproceedings{bb229603,
        AUTHOR = "Stojnic, V. and Kalantidis, Y. and Tolias, G.",
        TITLE = "Label Propagation for Zero-shot Classification with Vision-Language
Models",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "23209--23218",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224589"}

@inproceedings{bb229604,
        AUTHOR = "Yuan, T. and Zhang, X. and Liu, K. and Liu, B. and Chen, C. and Jin, J. and Jiao, Z.Z.",
        TITLE = "Towards Surveillance Video-and-Language Understanding: New Dataset,
Baselines, and Challenges",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "22052--22061",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224590"}

@inproceedings{bb229605,
        AUTHOR = "Chen, Y.F. and Chen, D.P. and Liu, R.J. and Zhou, S. and Xue, W.Y. and Peng, W.",
        TITLE = "Align Before Adapt: Leveraging Entity-to-Region Alignments for
Generalizable Video Action Recognition",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "18688--18698",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224591"}

@inproceedings{bb229606,
        AUTHOR = "Mittal, H. and Agarwal, N. and Lo, S.Y. and Lee, K.",
        TITLE = "Can't make an Omelette without Breaking some Eggs: Plausible Action
Anticipation using Large Video-Language Models",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "18580--18590",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224592"}

@inproceedings{bb229607,
        AUTHOR = "Kahatapitiya, K. and Arnab, A. and Nagrani, A. and Ryoo, M.S.",
        TITLE = "VicTR: Video-conditioned Text Representations for Activity
Recognition",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "18547--18558",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224593"}

@inproceedings{bb229608,
        AUTHOR = "Wu, T.Y. and Ho, C.H. and Vasconcelos, N.M.",
        TITLE = "ProTeCt: Prompt Tuning for Taxonomic Open Set Classification",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "16531--16540",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224594"}

@inproceedings{bb229609,
        AUTHOR = "Zhao, G. and Li, G.B. and Chen, W. and Yu, Y.Z.",
        TITLE = "OVER-NAV: Elevating Iterative Vision-and-Language Navigation with
Open-Vocabulary Detection and StructurEd Representation",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "16296--16306",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224595"}

@inproceedings{bb229610,
        AUTHOR = "Li, X. and Wu, Y.F. and Jiang, X.H. and Guo, Z.H. and Gong, M.M. and Cao, H.Y. and Liu, Y.S. and Jiang, D.Q. and Sun, X.",
        TITLE = "Enhancing Visual Document Understanding with Contrastive Learning in
Large Visual-Language Models",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "15546--15555",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224596"}

@inproceedings{bb229611,
        AUTHOR = "Pham, K. and Huynh, C. and Lim, S.N. and Shrivastava, A.",
        TITLE = "Composing Object Relations and Attributes for Image-Text Matching",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "14354--14363",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224597"}

@inproceedings{bb229612,
        AUTHOR = "Kim, G. and Kim, S. and Lee, S.",
        TITLE = "AAPL: Adding Attributes to Prompt Learning for Vision-Language Models",
        BOOKTITLE = Prompting24,
        YEAR = "2024",
        PAGES = "1572--1582",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224598"}

@inproceedings{bb229613,
        AUTHOR = "Xu, Z. and Zhu, Y. and Deng, S.Q. and Mittal, A. and Chen, Y.B. and Wang, M. and Favaro, P. and Tighe, J. and Modolo, D.",
        TITLE = "Benchmarking Zero-Shot Recognition with Vision-Language Models:
Challenges on Granularity and Specificity",
        BOOKTITLE = WhatNext24,
        YEAR = "2024",
        PAGES = "1827--1836",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224599"}

@inproceedings{bb229614,
        AUTHOR = "Luo, Z.W. and Gustafsson, F.K. and Zhao, Z. and Sjolund, J. and Schon, T.B.",
        TITLE = "Photo-Realistic Image Restoration in the Wild with Controlled
Vision-Language Models",
        BOOKTITLE = NTIRE24,
        YEAR = "2024",
        PAGES = "6641--6651",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224600"}

@inproceedings{bb229615,
        AUTHOR = "Huang, C.Q. and Jiang, A. and Feng, J.H. and Zhang, Y. and Wang, X.C. and Wang, Y.F.",
        TITLE = "Adapting Visual-Language Models for Generalizable Anomaly Detection
in Medical Images",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "11375--11385",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224601"}

@inproceedings{bb229616,
        AUTHOR = "Bang, J. and Ahn, S. and Lee, J.G.",
        TITLE = "Active Prompt Learning in Vision Language Models",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "26994--27004",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224602"}

@inproceedings{bb229617,
        AUTHOR = "Pan, C. and Yaman, B. and Nesti, T. and Mallik, A. and Allievi, A.G. and Velipasalar, S. and Ren, L.",
        TITLE = "VLP: Vision Language Planning for Autonomous Driving",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "14760--14769",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224603"}

@inproceedings{bb229618,
        AUTHOR = "Liang, M. and Su, J.C. and Schulter, S. and Garg, S. and Zhao, S.Y. and Wu, Y. and Chandraker, M.",
        TITLE = "AIDE: An Automatic Data Engine for Object Detection in Autonomous
Driving",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "14695--14706",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224604"}

@inproceedings{bb229619,
        AUTHOR = "Li, Z. and Li, X. and Fu, X. and Zhang, X. and Wang, W.Q. and Chen, S. and Yang, J.",
        TITLE = "PromptKD: Unsupervised Prompt Distillation for Vision-Language Models",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "26607--26616",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224605"}

@inproceedings{bb229620,
        AUTHOR = "Khandelwal, A.",
        TITLE = "PromptSync: Bridging Domain Gaps in Vision-Language Models through
Class-Aware Prototype Alignment and Discrimination",
        BOOKTITLE = ZeroShot24,
        YEAR = "2024",
        PAGES = "7819--7828",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224606"}

@inproceedings{bb229621,
        AUTHOR = "Hirohashi, Y. and Hirakawa, T. and Yamashita, T. and Fujiyoshi, H.",
        TITLE = "Prompt Learning with One-Shot Setting based Feature Space Analysis in
Vision-and-Language Models",
        BOOKTITLE = ZeroShot24,
        YEAR = "2024",
        PAGES = "7761--7770",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224607"}

@inproceedings{bb229622,
        AUTHOR = "Zhang, L. and Awal, R. and Agrawal, A.",
        TITLE = "Contrasting Intra-Modal and Ranking Cross-Modal Hard Negatives to
Enhance Visio-Linguistic Compositional Understanding",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "13774--13784",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224608"}

@inproceedings{bb229623,
        AUTHOR = "Rosasco, A. and Berti, S. and Pasquale, G. and Malafronte, D. and Sato, S. and Segawa, H. and Inada, T. and Natale, L.",
        TITLE = "ConCon-Chi: Concept-Context Chimera Benchmark for Personalized
Vision-Language Tasks",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "22239--22248",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224609"}

@inproceedings{bb229624,
        AUTHOR = "Cheng, S. and Guo, Z.C. and Wu, J. and Fang, K. and Li, P. and Liu, H.P. and Liu, Y.",
        TITLE = "EgoThink: Evaluating First-Person Perspective Thinking Capability of
Vision-Language Models",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "14291--14302",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224610"}

@inproceedings{bb229625,
        AUTHOR = "Guan, T.R. and Liu, F. and Wu, X. and Xian, R.Q. and Li, Z.X. and Liu, X.Y. and Wang, X. and Chen, L. and Huang, F. and Yacoob, Y. and Manocha, D. and Zhou, T.Y.",
        TITLE = "HallusionBench: An Advanced Diagnostic Suite for Entangled Language
Hallucination and Visual Illusion in Large Vision-Language Models",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "14375--14385",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224611"}

@inproceedings{bb229626,
        AUTHOR = "Kil, J. and Song, C.H. and Zheng, B. and Deng, X. and Su, Y. and Chao, W.L.",
        TITLE = "Dual-View Visual Contextualization for Web Navigation",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "14445--14454",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224612"}

@inproceedings{bb229627,
        AUTHOR = "Guo, Y.Y. and Wang, G.Z. and Kankanhalli, M.",
        TITLE = "PELA: Learning Parameter-Efficient Models with Low-Rank Approximation",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "15699--15709",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224613"}

@inproceedings{bb229628,
        AUTHOR = "Cao, J.J. and Ye, P. and Li, S.Z. and Yu, C. and Tang, Y.S. and Lu, J.W. and Chen, T.",
        TITLE = "MADTP: Multimodal Alignment-Guided Dynamic Token Pruning for
Accelerating Vision-Language Transformer",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "15710--15719",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224614"}

@inproceedings{bb229629,
        AUTHOR = "Farina, M. and Mancini, M. and Cunegatti, E. and Liu, G.W. and Iacca, G. and Ricci, E.",
        TITLE = "MULTIFLOW: Shifting Towards Task-Agnostic Vision-Language Pruning",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "16185--16195",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224615"}

@inproceedings{bb229630,
        AUTHOR = "Mu, F.Z. and Mo, S.C. and Li, Y.",
        TITLE = "SnAG: Scalable and Accurate Video Grounding",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "18930--18940",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224616"}

@inproceedings{bb229631,
        AUTHOR = "Cao, Y.H. and Ji, K.X. and Huang, Z.Y. and Zheng, C.Y. and Liu, J.J. and Wang, J. and Chen, J.D. and Yang, M.",
        TITLE = "Towards Better Vision-Inspired Vision-Language Models",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "13537--13547",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224617"}

@inproceedings{bb229632,
        AUTHOR = "Shi, K.Y. and Dong, Q. and Goncalves, L. and Tu, Z.W. and Soatto, S.",
        TITLE = "Non-autoregressive Sequence-to-Sequence Vision-Language Models",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "13603--13612",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224618"}

@inproceedings{bb229633,
        AUTHOR = "Man, Y.Z. and Gui, L.Y. and Wang, Y.X.",
        TITLE = "Situational Awareness Matters in 3D Vision Language Reasoning",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "13678--13688",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224619"}

@inproceedings{bb229634,
        AUTHOR = "Zheng, C.H. and Zhang, J. and Kembhavi, A. and Krishna, R.",
        TITLE = "Iterated Learning Improves Compositionality in Large Vision-Language
Models",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "13785--13795",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224620"}

@inproceedings{bb229635,
        AUTHOR = "Leng, S. and Zhang, H. and Chen, G.Z. and Li, X. and Lu, S.J. and Miao, C.Y. and Bing, L.",
        TITLE = "Mitigating Object Hallucinations in Large Vision-Language Models
through Visual Contrastive Decoding",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "13872--13882",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224621"}

@inproceedings{bb229636,
        AUTHOR = "Song, C.H. and Hwang, T. and Yoon, J.Y. and Choi, S. and Gu, Y.H.",
        TITLE = "SyncMask: Synchronized Attentional Masking for Fashion-centric
Vision-Language Pretraining",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "13948--13957",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224622"}

@inproceedings{bb229637,
        AUTHOR = "Pramanick, S. and Han, G.X. and Hou, R. and Nag, S. and Lim, S.N. and Ballas, N. and Wang, Q.F. and Chellappa, R. and Almahairi, A.",
        TITLE = "Jack of All Tasks, Master of Many: Designing General-purpose
Coarse-to-Fine Vision-Language Model",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "14076--14088",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224623"}

@inproceedings{bb229638,
        AUTHOR = "Zeng, Y. and Huang, Y. and Zhang, J.J. and Jie, Z.Q. and Chai, Z.H. and Wang, L.",
        TITLE = "Investigating Compositional Challenges in Vision-Language Models for
Visual Grounding",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "14141--14151",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224624"}

@inproceedings{bb229639,
        AUTHOR = "Karmanov, A. and Guan, D. and Lu, S.J. and El Saddik, A. and Xing, E.",
        TITLE = "Efficient Test-Time Adaptation of Vision-Language Models",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "14162--14171",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224625"}

@inproceedings{bb229640,
        AUTHOR = "Sameni, S. and Kafle, K. and Tan, H. and Jenni, S.",
        TITLE = "Building Vision-Language Models on Solid Foundations with Masked
Distillation",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "14216--14226",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224626"}

@inproceedings{bb229641,
        AUTHOR = "Li, R.J. and Wu, Y. and He, X.M.",
        TITLE = "Learning by Correction: Efficient Tuning Task for Zero-Shot
Generative Vision-Language Reasoning",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "13428--13437",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224627"}

@inproceedings{bb229642,
        AUTHOR = "Peng, W. and Xie, S.C. and You, Z. and Lan, S.Y. and Wu, Z.X.",
        TITLE = "Synthesize, Diagnose, and Optimize: Towards Fine-Grained
Vision-Language Understanding",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "13279--13288",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224628"}

@inproceedings{bb229643,
        AUTHOR = "Zhao, Y. and Zhao, L. and Zhou, X.Y. and Wu, J.L. and Chu, C.T. and Miao, H. and Schroff, F. and Adam, H. and Liu, T. and Gong, B.Q. and Krahenbuhl, P. and Yuan, L.Z.",
        TITLE = "Distilling Vision-Language Models on Millions of Videos",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "13106--13116",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224629"}

@inproceedings{bb229644,
        AUTHOR = "Chen, J. and Yu, Q.H. and Shen, X.H. and Yuille, A.L. and Chen, L.C.",
        TITLE = "ViTamin: Designing Scalable Vision Models in the Vision-Language Era",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "12954--12966",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224630"}

@inproceedings{bb229645,
        AUTHOR = "Liu, S.H. and Yu, S. and Lin, Z.Q. and Pathak, D. and Ramanan, D.",
        TITLE = "Language Models as Black-Box Optimizers for Vision-Language Models",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "12687--12697",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224631"}

@inproceedings{bb229646,
        AUTHOR = "Howard, P. and Madasu, A. and Le, T. and Moreno, G.L. and Bhiwandiwalla, A. and Lal, V.",
        TITLE = "SocialCounterfactuals: Probing and Mitigating Intersectional Social
Biases in Vision-Language Models with Counterfactual Examples",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "11975--11985",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224632"}

@inproceedings{bb229647,
        AUTHOR = "Jiang, Y. and Huang, Z.Z. and Zhang, R.Z. and Zhang, X.F. and Zhang, S.T.",
        TITLE = "ZePT: Zero-Shot Pan-Tumor Segmentation via Query-Disentangling and
Self-Prompting",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "11386--11397",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224633"}

@inproceedings{bb229648,
        AUTHOR = "Kim, Y. and Mo, S. and Kim, M. and Lee, K. and Lee, J. and Shin, J.",
        TITLE = "Discovering and Mitigating Visual Biases Through Keyword Explanation",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "11082--11092",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224634"}

@inproceedings{bb229649,
        AUTHOR = "Li, R. and Fischer, T. and Segu, M. and Pollefeys, M. and Van Gool, L.J. and Tombari, F.",
        TITLE = "Know Your Neighbors: Improving Single-View Reconstruction via Spatial
Vision-Language Reasoning",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "9848--9858",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224635"}

@inproceedings{bb229650,
        AUTHOR = "Zeng, Z. and Wang, D. and Yang, F.Y. and Park, H. and Soatto, S. and Lao, D. and Wong, A.",
        TITLE = "WorDepth: Variational Language Prior for Monocular Depth Estimation",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "9708--9719",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224636"}

@inproceedings{bb229651,
        AUTHOR = "Hu, Y.S. and Stretcu, O. and Lu, C.T. and Viswanathan, K. and Hata, K. and Luo, E. and Krishna, R. and Fuxman, A.",
        TITLE = "Visual Program Distillation: Distilling Tools and Programmatic
Reasoning into Vision-Language Models",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "9590--9601",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224637"}

@inproceedings{bb229652,
        AUTHOR = "Khan, Z. and Fu, Y.",
        TITLE = "Consistency and Uncertainty: Identifying Unreliable Responses From
Black-Box Vision-Language Models for Selective Visual Question
Answering",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "10854--10863",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224638"}

@inproceedings{bb229653,
        AUTHOR = "Gu, T.C. and Yang, K.C. and Liu, D. and Cai, W.D.",
        TITLE = "LaPA: Latent Prompt Assist Model for Medical Visual Question
Answering",
        BOOKTITLE = DEF-AI-MIA24,
        YEAR = "2024",
        PAGES = "4971--4980",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224639"}

@inproceedings{bb229654,
        AUTHOR = "Silva Rodriguez, J. and Hajimiri, S. and Ben Ayed, I. and Dolz, J.",
        TITLE = "A Closer Look at the Few-Shot Adaptation of Large Vision-Language
Models",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "23681--23690",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224640"}

@inproceedings{bb229655,
        AUTHOR = "Zanella, M. and Ben Ayed, I.",
        TITLE = "Low-Rank Few-Shot Adaptation of Vision-Language Models",
        BOOKTITLE = Prompting24,
        YEAR = "2024",
        PAGES = "1593--1603",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224641"}

@article{bb229656,
        AUTHOR = "Wang, W.X. and He, X.J. and Zhang, Y. and Guo, L.T. and Shen, J.C. and Li, J.Y. and Liu, J.",
        TITLE = "CM-MaskSD: Cross-Modality Masked Self-Distillation for Referring
Image Segmentation",
        JOURNAL = MultMed,
        VOLUME = "26",
        YEAR = "2024",
        PAGES = "6906--6916",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224642"}

@inproceedings{bb229657,
        AUTHOR = "Sahin, U. and Li, H. and Khan, Q. and Cremers, D. and Tresp, V.",
        TITLE = "Enhancing Multimodal Compositional Reasoning of Visual Language
Models with Generative Negative Mining",
        BOOKTITLE = WACV24,
        YEAR = "2024",
        PAGES = "5551--5561",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224643"}

@inproceedings{bb229658,
        AUTHOR = "Yang, C. and Xu, R. and Guo, Y. and Huang, P.X. and Chen, Y. and Ding, W. and Wang, Z.Y. and Zhou, H.",
        TITLE = "Improving Vision-and-Language Reasoning via Spatial Relations
Modeling",
        BOOKTITLE = WACV24,
        YEAR = "2024",
        PAGES = "758--767",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224644"}

@inproceedings{bb229659,
        AUTHOR = "Shen, S. and Yang, S. and Zhang, T.J. and Zhai, B. and Gonzalez, J.E. and Keutzer, K. and Darrell, T.J.",
        TITLE = "Multitask Vision-Language Prompt Tuning",
        BOOKTITLE = WACV24,
        YEAR = "2024",
        PAGES = "5644--5655",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224645"}

@inproceedings{bb229660,
        AUTHOR = "Zhang, G. and Zhang, Y.R. and Zhang, K. and Tresp, V.",
        TITLE = "Can Vision-Language Models be a Good Guesser? Exploring VLMs for
Times and Location Reasoning",
        BOOKTITLE = WACV24,
        YEAR = "2024",
        PAGES = "625--634",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224646"}

@inproceedings{bb229661,
        AUTHOR = "Feinglass, J. and Yang, Y.Z.",
        TITLE = "Towards Addressing the Misalignment of Object Proposal Evaluation for
Vision-Language Tasks via Semantic Grounding",
        BOOKTITLE = WACV24,
        YEAR = "2024",
        PAGES = "4385--4395",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224647"}

@inproceedings{bb229662,
        AUTHOR = "Nadeem, A. and Hilton, A. and Dawes, R. and Thomas, G. and Mustafa, A.",
        TITLE = "CAD: Contextual Multi-modal Alignment for Dynamic AVQA",
        BOOKTITLE = WACV24,
        YEAR = "2024",
        PAGES = "7236--7248",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224648"}

@inproceedings{bb229663,
        AUTHOR = "Wu, W. and Li, Q. and Zhong, W.L. and Huang, J.Z.",
        TITLE = "MIVC: Multiple Instance Visual Component for Visual-Language Models",
        BOOKTITLE = WACV24,
        YEAR = "2024",
        PAGES = "8102--8111",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224649"}

@inproceedings{bb229664,
        AUTHOR = "Ganz, R. and Nuriel, O. and Aberdam, A. and Kittenplon, Y. and Mazor, S. and Litman, R.",
        TITLE = "Towards Models that Can See and Read",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "21661--21671",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224650"}

@inproceedings{bb229665,
        AUTHOR = "Zhang, H. and Liu, D. and Lv, Z. and Su, B. and Tao, D.C.",
        TITLE = "Exploring Temporal Concurrency for Video-Language Representation
Learning",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "15522--15532",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224651"}

@inproceedings{bb229666,
        AUTHOR = "Shukor, M. and Dancette, C. and Cord, M.",
        TITLE = "eP-ALM: Efficient Perceptual Augmentation of Language Models",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "21999--22012",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224652"}

@inproceedings{bb229667,
        AUTHOR = "Schulter, S. and Kumar, B.G.V. and Suh, Y.M. and Dafnis, K.M. and Zhang, Z.X. and Zhao, S.Y. and Metaxas, D.N.",
        TITLE = "OmniLabel: A Challenging Benchmark for Language-Based Object
Detection",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "11919--11928",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224653"}

@inproceedings{bb229668,
        AUTHOR = "Chen, Z.L. and Huang, X. and Guan, Q.L. and Lin, L. and Luo, W.Q.",
        TITLE = "A Retrospect to Multi-prompt Learning across Vision and Language",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "22133--22144",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224654"}

@inproceedings{bb229669,
        AUTHOR = "Derakhshani, M.M. and Sanchez, E. and Bulat, A. and da Costa, V.G.T. and Snoek, C.G.M. and Tzimiropoulos, G. and Martinez, B.",
        TITLE = "Bayesian Prompt Learning for Image-Language Model Generalization",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "15191--15200",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224655"}

@inproceedings{bb229670,
        AUTHOR = "Cascante Bonilla, P. and Shehada, K. and Smith, J.S. and Doveh, S. and Kim, D.H. and Panda, R. and Varol, G. and Oliva, A. and Ordonez, V. and Feris, R.S. and Karlinsky, L.",
        TITLE = "Going Beyond Nouns With Vision \& Language Models Using Synthetic
Data",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "20098--20108",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224656"}

@inproceedings{bb229671,
        AUTHOR = "Upadhyay, U. and Karthik, S. and Mancini, M. and Akata, Z.",
        TITLE = "ProbVLM: Probabilistic Adapter for Frozen Vision-Language Models",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "1899--1910",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224657"}

@inproceedings{bb229672,
        AUTHOR = "Bitton Guetta, N. and Bitton, Y. and Hessel, J. and Schmidt, L. and Elovici, Y. and Stanovsky, G. and Schwartz, R.",
        TITLE = "Breaking Common Sense: WHOOPS! A Vision-and-Language Benchmark of
Synthetic and Compositional Images",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "2616--2627",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224658"}

@inproceedings{bb229673,
        AUTHOR = "Hu, Z.Y. and Li, Y. and Lyu, M.R. and Wang, L.W.",
        TITLE = "VL-PET: Vision-and-Language Parameter-Efficient Tuning via
Granularity Control",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "2998--3008",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224659"}

@inproceedings{bb229674,
        AUTHOR = "Slyman, E. and Kahng, M. and Lee, S.",
        TITLE = "VLSlice: Interactive Vision-and-Language Slice Discovery",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "15245--15255",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224660"}

@inproceedings{bb229675,
        AUTHOR = "Najibi, M. and Ji, J.W. and Zhou, Y. and Qi, C.R. and Yan, X.C. and Ettinger, S. and Anguelov, D.",
        TITLE = "Unsupervised 3D Perception with 2D Vision-Language Distillation for
Autonomous Driving",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "8568--8578",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224661"}

@inproceedings{bb229676,
        AUTHOR = "Xu, H. and Xie, S. and Huang, P.Y. and Yu, L.C. and Howes, R. and Ghosh, G. and Zettlemoyer, L. and Feichtenhofer, C.",
        TITLE = "CiT: Curation in Training for Effective Vision-Language Data",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "15134--15143",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224662"}

@inproceedings{bb229677,
        AUTHOR = "Trager, M. and Perera, P. and Zancato, L. and Achille, A. and Bhatia, P. and Soatto, S.",
        TITLE = "Linear Spaces of Meanings: Compositional Structures in
Vision-Language Models",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "15349--15358",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224663"}

@inproceedings{bb229678,
        AUTHOR = "Chen, Y.S. and Song, Y.Z. and Yeo, C.Y. and Liu, B. and Fu, J.L. and Shuai, H.H.",
        TITLE = "SINC: Self-Supervised In-Context Learning for Vision-Language Tasks",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "15384--15396",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224664"}

@inproceedings{bb229679,
        AUTHOR = "Wu, C.E. and Tian, Y. and Yu, H.C. and Wang, H. and Morgado, P. and Hu, Y.H. and Yang, L.J.",
        TITLE = "Why Is Prompt Tuning for Vision-Language Models Robust to Noisy
Labels?",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "15442--15451",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224665"}

@inproceedings{bb229680,
        AUTHOR = "Ouali, Y. and Bulat, A. and Martinez, B. and Tzimiropoulos, G.",
        TITLE = "Black Box Few-Shot Adaptation for Vision-Language Models",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "15488--15500",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224666"}

@inproceedings{bb229681,
        AUTHOR = "Kan, B. and Wang, T. and Lu, W.P. and Zhen, X.T. and Guan, W. and Zheng, F.",
        TITLE = "Knowledge-Aware Prompt Tuning for Generalizable Vision-Language
Models",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "15624--15634",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224667"}

@inproceedings{bb229682,
        AUTHOR = "Zhai, J.T. and Zhang, Q. and Wu, T. and Chen, X.Y. and Liu, J.J. and Cheng, M.M.",
        TITLE = "{SLAN}: Self-Locator Aided Network for Vision-Language Understanding",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "21892--21901",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224668"}

@inproceedings{bb229683,
        AUTHOR = "Long, S. and Zhao, Z. and Yuan, J. and Tan, Z.C. and Liu, J.J. and Zhou, L.P. and Wang, S.S. and Wang, J.D.",
        TITLE = "Task-Oriented Multi-Modal Mutual Learning for Vision-Language Models",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "21902--21912",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224669"}

@inproceedings{bb229684,
        AUTHOR = "Cho, E. and Kim, J. and Kim, H.W.J.",
        TITLE = "Distribution-Aware Prompt Tuning for Vision-Language Models",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "21947--21956",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224670"}

@inproceedings{bb229685,
        AUTHOR = "Varma, M. and Delbrouck, J.B. and Hooper, S. and Chaudhari, A. and Langlotz, C.",
        TITLE = "{ViLLA}: Fine-Grained Vision-Language Representation Learning from Real-World Data",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "22168--22178",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224671"}

@inproceedings{bb229686,
        AUTHOR = "Zhu, H.G. and Wei, Y.C. and Liang, X.D. and Zhang, C.J. and Zhao, Y.",
        TITLE = "{CTP}: Towards Vision-Language Continual Pretraining via Compatible Momentum Contrast and Topology Preservation",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "22200--22210",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224672"}

@inproceedings{bb229687,
        AUTHOR = "Hu, Z.Z. and Zhu, X.L. and Tran, S. and Vidal, R. and Dhua, A.",
        TITLE = "{ProVLA}: Compositional Image Search with Progressive Vision-Language Alignment and Multimodal Fusion",
        BOOKTITLE = CLVL23,
        YEAR = "2023",
        PAGES = "2764--2769",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224673"}

@inproceedings{bb229688,
        AUTHOR = "Hall, M. and Gustafson, L. and Adcock, A. and Misra, I. and Ross, C.",
        TITLE = "Vision-Language Models Performing Zero-Shot Tasks Exhibit Disparities Between Gender Groups",
        BOOKTITLE = CLVL23,
        YEAR = "2023",
        PAGES = "2770--2777",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224674"}

@inproceedings{bb229689,
        AUTHOR = "Agnolucci, L. and Baldrati, A. and Todino, F. and Becattini, F. and Bertini, M. and del Bimbo, A.",
        TITLE = "{ECO}: Ensembling Context Optimization for Vision-Language Models",
        BOOKTITLE = CLVL23,
        YEAR = "2023",
        PAGES = "2803--2807",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224675"}

@inproceedings{bb229690,
        AUTHOR = "Palit, V. and Pandey, R. and Arora, A. and Liang, P.P.",
        TITLE = "Towards Vision-Language Mechanistic Interpretability: A Causal Tracing Tool for {BLIP}",
        BOOKTITLE = CLVL23,
        YEAR = "2023",
        PAGES = "2848--2853",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224676"}

@inproceedings{bb229691,
        AUTHOR = "Sammani, F. and Deligiannis, N.",
        TITLE = "{Uni-NLX}: Unifying Textual Explanations for Vision and Vision-Language Tasks",
        BOOKTITLE = VLAR23,
        YEAR = "2023",
        PAGES = "4636--4641",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224677"}

@inproceedings{bb229692,
        AUTHOR = "Lee, D.J. and Song, S. and Suh, J. and Choi, J. and Lee, S. and Kim, H.W.J.",
        TITLE = "Read-only Prompt Optimization for Vision-Language Few-shot Learning",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "1401--1411",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224678"}

@inproceedings{bb229693,
        AUTHOR = "Li, X. and Fang, Y.H. and Liu, M.H. and Ling, Z. and Tu, Z.W. and Su, H.",
        TITLE = "Distilling Large Vision-Language Model with Out-of-Distribution Generalizability",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "2492--2503",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224679"}

@inproceedings{bb229694,
        AUTHOR = "Li, J.C. and Gao, M. and Wei, L.H. and Tang, S.L. and Zhang, W.Q. and Li, M.Z. and Ji, W. and Tian, Q. and Chua, T.S. and Zhuang, Y.T.",
        TITLE = "Gradient-Regulated Meta-Prompt Learning for Generalizable Vision-Language Models",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "2551--2562",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224680"}

@inproceedings{bb229695,
        AUTHOR = "Bi, J.Y. and Cheng, D. and Yao, P. and Pang, B. and Zhan, Y.F. and Yang, C.G. and Wang, Y.J. and Sun, H. and Deng, W.W. and Zhang, Q.",
        TITLE = "{VL-Match}: Enhancing Vision-Language Pretraining with Token-Level and Instance-Level Matching",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "2584--2593",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224681"}

@inproceedings{bb229696,
        AUTHOR = "Udandarao, V. and Gupta, A. and Albanie, S.",
        TITLE = "{SuS-X}: Training-Free Name-Only Transfer of Vision-Language Models",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "2725--2736",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224682"}

@inproceedings{bb229697,
        AUTHOR = "Jiang, C.Y. and Xu, H.Y. and Ye, W. and Ye, Q.H. and Li, C.L. and Yan, M. and Bi, B. and Zhang, S.K. and Huang, F. and Huang, S.F.",
        TITLE = "{BUS}: Efficient and Effective Vision-language Pre-training with Bottom-Up Patch Summarization",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "2888--2898",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224683"}

@inproceedings{bb229698,
        AUTHOR = "Shi, C. and Yang, S.",
        TITLE = "{LoGoPrompt}: Synthetic Text Images Can Be Good Visual Prompts for Vision-Language Models",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "2920--2929",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224684"}

@inproceedings{bb229699,
        AUTHOR = "Wang, A.J.P. and Lin, K.Q. and Zhang, D.J.H. and Lei, S.W.X. and Shou, M.Z.",
        TITLE = "Too Large; Data Reduction for Vision-Language Pre-Training",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "3124--3134",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT224685"}

Last update: May 14, 2025 at 16:05:19