@article{bb241900,
        AUTHOR = "Koo, H. and Shin, J. and Kim, E.",
        TITLE = "Dual-branch scale disentanglement for text-video retrieval",
        JOURNAL = PRL,
        VOLUME = "196",
        YEAR = "2025",
        PAGES = "296--302",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236816"}

@article{bb241901,
  author    = {Zhou, J. and Wang, M.},
  title     = {Unified learning for image-text alignment via multi-scale feature fusion},
  journal   = CVIU,
  volume    = {260},
  year      = {2025},
  pages     = {104468},
  bibsource = {http://www.visionbib.com/bibliography/applicat803imt4.html#TT236817}
}

@article{bb241902,
        AUTHOR = "Wen, J. and Chen, Y. F. and Shi, R. Q. and Ji, W. and Yang, M. L. and Gao, D. F. and Yuan, J. S. and Zimmermann, R.",
        TITLE = "{HOVER}: Hyperbolic Video-Text Retrieval",
        JOURNAL = IP,
        VOLUME = "34",
        YEAR = "2025",
        PAGES = "6192--6203",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236818"}

@article{bb241903,
        AUTHOR = "Fang, J. Y. and Zhu, B. and Yuan, J. L. and Chen, Y. Y. and Tang, M. and Wang, J. Q.",
        TITLE = "{AMITA}: Attribute-Guided Masked Image-Text Alignment for Multi-Label Image Representation",
        JOURNAL = CirSysVideo,
        VOLUME = "35",
        YEAR = "2025",
        NUMBER = "11",
        MONTH = nov,
        PAGES = "11432--11447",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236819"}

@article{bb241904,
        AUTHOR = "Ji, L. L. and Liu, L.",
        TITLE = "Multi-Scale Feature Fusion Based on Piecewise Polynomial Activation Function for Image-Text Matching",
        JOURNAL = CirSysVideo,
        VOLUME = "35",
        YEAR = "2025",
        NUMBER = "11",
        MONTH = nov,
        PAGES = "11627--11640",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236820"}

@article{bb241905,
        AUTHOR = "Chen, R. and Su, T. and Wang, H. and Ni, Z. K.",
        TITLE = "Similarity Shuffled Criss-Cross Transformer With Angle Loss for Image-Text Matching",
        JOURNAL = MultMed,
        VOLUME = "27",
        YEAR = "2025",
        PAGES = "9723--9734",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236821"}

@article{bb241906,
        AUTHOR = "Chen, D. and Wang, Y. T. and Xie, Y. Z. and Chen, S. Y. and Peng, W. L. and Tang, M. and Fang, M. and Chen, C. L. P. and Li, P. and Zhang, W.",
        TITLE = "Intra-modal consistency for image-text retrieval through soft-label distillation",
        JOURNAL = PR,
        VOLUME = "173",
        YEAR = "2026",
        PAGES = "112817",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236822"}

@article{bb241907,
        AUTHOR = "Shi, Z. X. and Ding, Y. and Dong, J. Y. and Zhang, T. Z.",
        TITLE = "Beyond One and Two Tower: Cross-Modal Consensus Learning for Image-Text Retrieval",
        JOURNAL = CirSysVideo,
        VOLUME = "36",
        YEAR = "2026",
        NUMBER = "2",
        MONTH = feb,
        PAGES = "2581--2593",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236823"}

@article{bb241908,
        AUTHOR = "Wang, H. C. and Liu, L. and Zhang, H. X. and Zhu, L. and Chang, X. J. and Du, H.",
        TITLE = "{VisualRAG}: Knowledge-Guided Retrieval Augmentation for Image-Text Matching",
        JOURNAL = CirSysVideo,
        VOLUME = "36",
        YEAR = "2026",
        NUMBER = "1",
        MONTH = jan,
        PAGES = "1234--1248",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236824"}

@article{bb241909,
        AUTHOR = "Zhang, D. L. and Wang, Z. W. and Wu, X. J. and Kittler, J. V.",
        TITLE = "{HACG}: Leveraging Hierarchical Alignment and Caption Generation for Text-Video Retrieval",
        JOURNAL = IJCV,
        VOLUME = "134",
        YEAR = "2026",
        NUMBER = "1",
        MONTH = jan,
        PAGES = "93",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236825"}

@inproceedings{bb241910,
        AUTHOR = "Vongala, M. R. and Srivastava, S. and Kosecka, J.",
        TITLE = "Compositional Image-Text Matching and Retrieval by Grounding Entities",
        BOOKTITLE = "MULA25",
        YEAR = "2025",
        PAGES = "241--250",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236826"}

@inproceedings{bb241911,
        AUTHOR = "Zhang, Z. C. and Li, X. Y. and Sun, W. and Zhang, Z. C. and Li, Y. H. and Liu, X. H. and Zhai, G. T.",
        TITLE = "Leveraging Multimodal Large Language Models for Joint Discrete and Continuous Evaluation in Text-to-Image Alignment",
        BOOKTITLE = NTIRE25,
        YEAR = "2025",
        PAGES = "968--977",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236827"}

@inproceedings{bb241912,
        AUTHOR = "Zhang, Z. J. and Zheng, X. H. and Wu, X. C. and Peng, C. and Cao, X. Z.",
        TITLE = "{TokenFocus-VQA}: Enhancing Text-to-Image Alignment with Position-Aware Focus and Multi-Perspective Aggregations on {LVLMs}",
        BOOKTITLE = NTIRE25,
        YEAR = "2025",
        PAGES = "1270--1279",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236828"}

@inproceedings{bb241913,
        AUTHOR = "Yue, X. and Sun, J. and Lu, J. and Yao, L. C. and Xia, F. and Wang, T. Y. and Rao, F. Y. and Lyu, J. and Deng, Y.",
        TITLE = "Instruction-Augmented Multimodal Alignment for Image-Text and Element Matching",
        BOOKTITLE = NTIRE25,
        YEAR = "2025",
        PAGES = "1370--1379",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236829"}

@inproceedings{bb241914,
        AUTHOR = "Lai, H. and Xiong, G. X. and Mai, H. Y. and Liu, X. and Zhang, T. Z.",
        TITLE = "Rethinking Noisy Video-Text Retrieval via Relation-aware Alignment",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "9231--9241",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236830"}

@inproceedings{bb241915,
        AUTHOR = "Kim, D. and Piergiovanni, A. and Mallya, G. and Angelova, A.",
        TITLE = "{VideoComp}: Advancing Fine-Grained Compositional and Temporal Alignment in Video-Text Models",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "29060--29070",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236831"}

@inproceedings{bb241916,
        AUTHOR = "Shen, L. and Gong, G. Q. and Hao, T. X. and He, T. and Zhang, Y. F. and Liu, P. Z. and Zhao, S. C. and Han, J. G. and Ding, G.",
        TITLE = "{DiscoVLA}: Discrepancy Reduction in Vision, Language, and Alignment for Parameter-Efficient Video-Text Retrieval",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "19702--19712",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236832"}

@inproceedings{bb241917,
        AUTHOR = "Jin, Z. X. and Xu, X. W. and Wang, X. D.",
        TITLE = "{MADA}: Multi-Window Attention and Dual-Alignment for Image-Text Retrieval",
        BOOKTITLE = ICIVC24,
        YEAR = "2024",
        PAGES = "240--245",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236833"}

@inproceedings{bb241918,
        AUTHOR = "Xie, C. W. and Sun, S. Y. and Zhao, L. M. and Li, P. and Ma, S. and Zheng, Y.",
        TITLE = "{FuseTeacher}: Modality-fused Encoders are Strong Vision Supervisors",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XLVIII: 287--304",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236834"}

@inproceedings{bb241919,
        AUTHOR = "Kim, W. and Chun, S. and Kim, T. and Han, D. Y. and Yun, S.",
        TITLE = "{HYPE}: Hyperbolic Entailment Filtering for Underspecified Images and Texts",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XL: 247--265",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236835"}

@inproceedings{bb241920,
        AUTHOR = "Sogi, N. and Shibata, T. and Terao, M.",
        TITLE = "Object-aware Query Perturbation for Cross-modal Image-text Retrieval",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "LXXIX: 447--464",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236836"}

@inproceedings{bb241921,
        AUTHOR = "Alper, M. and Averbuch-Elor, H.",
        TITLE = "Emergent Visual-semantic Hierarchies in Image-text Representations",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "LII: 220--238",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236837"}

@inproceedings{bb241922,
        AUTHOR = "Gordon, B. and Bitton, Y. and Shafir, Y. and Garg, R. and Chen, X. and Lischinski, D. and Cohen-Or, D. and Szpektor, I.",
        TITLE = "Mismatch Quest: Visual and Textual Feedback for Image-Text Misalignment",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "LVII: 310--328",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236838"}

@inproceedings{bb241923,
        AUTHOR = "Hua, H. and Shi, J. and Kafle, K. and Jenni, S. and Zhang, D. and Collomosse, J. and Cohen, S. and Luo, J. B.",
        TITLE = "{FineMatch}: Aspect-based Fine-grained Image and Text Mismatch Detection and Correction",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "IX: 474--491",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236839"}

@inproceedings{bb241924,
        AUTHOR = "Li, Y. H. and Liu, H. T. and Cai, M. and Li, Y. J. and Shechtman, E. and Lin, Z. and Lee, Y. J. and Singh, K. K.",
        TITLE = "Removing Distributional Discrepancies in Captions Improves Image-Text Alignment",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XXI: 405--422",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236840"}

@inproceedings{bb241925,
        AUTHOR = "Ma, W. and Li, K. and Jiang, Z. and Meshry, M. and Liu, Q. H. and Wang, H. Y. and Hane, C. and Yuille, A. L.",
        TITLE = "Rethinking Video-text Understanding: Retrieval from Counterfactually Augmented Data",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XIII: 254--269",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236841"}

@inproceedings{bb241926,
        AUTHOR = "Zhang, W. and Xu, X. W. and Tao, Y. and Wang, X. D. and Wang, C. L. and Wei, Z. M.",
        TITLE = "Bi-Directional Image-Text Retrieval With Position Attention and Similarity Filtering",
        BOOKTITLE = ICIVC22,
        YEAR = "2022",
        PAGES = "635--640",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236842"}

@inproceedings{bb241927,
        AUTHOR = "Li, Z. and Nian, X. H. and Pan, C. and Yang, D. and Xiong, H. Y. and Wang, H. B.",
        TITLE = "Relation Graph Reasoning for Image-Text Matching",
        BOOKTITLE = ICIVC22,
        YEAR = "2022",
        PAGES = "319--324",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236843"}

@inproceedings{bb241928,
        AUTHOR = "Zhang, K. and Mao, Z. D. and Wang, Q. and Zhang, Y. D.",
        TITLE = "Negative-Aware Attention Framework for Image-Text Matching",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "15640--15649",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236844"}

@inproceedings{bb241929,
        AUTHOR = "Long, S. and Han, S. C. and Wan, X. J. and Poon, J.",
        TITLE = "{GraDual}: Graph-based Dual-modal Representation for Image-Text Matching",
        BOOKTITLE = WACV22,
        YEAR = "2022",
        PAGES = "2463--2472",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236845"}

@inproceedings{bb241930,
        AUTHOR = "Biten, A. F. and Mafla, A. and Gomez, L. and Karatzas, D.",
        TITLE = "Is An Image Worth Five Sentences? A New Look into Semantics for Image-Text Matching",
        BOOKTITLE = WACV22,
        YEAR = "2022",
        PAGES = "2483--2492",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236846"}

@inproceedings{bb241931,
        AUTHOR = "Mithun, N. C. and Pasricha, R. and Papalexakis, E. and Roy-Chowdhury, A. K.",
        TITLE = "Webly Supervised Image-Text Embedding with Noisy Tag Refinement",
        BOOKTITLE = ICPR21,
        YEAR = "2021",
        PAGES = "7454--7461",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236847"}

@inproceedings{bb241932,
        AUTHOR = "Chen, J. A. and Zhang, L. and Wang, Q. and Bai, C. and Kpalma, K.",
        TITLE = "Intra-Modal Constraint Loss for Image-Text Retrieval",
        BOOKTITLE = ICIP22,
        YEAR = "2022",
        PAGES = "4023--4027",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236848"}

@inproceedings{bb241933,
        AUTHOR = "Liu, Y. and Wang, H. Q. and Meng, F. Y. and Liu, M. Y. and Liu, H.",
        TITLE = "Attend, Correct and Focus: A Bidirectional Correct Attention Network for Image-Text Matching",
        BOOKTITLE = ICIP21,
        YEAR = "2021",
        PAGES = "2673--2677",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236849"}

@inproceedings{bb241934,
        AUTHOR = "Yang, S. T. and Huang, K. H. and Howe, B.",
        TITLE = "{JECL}: Joint Embedding and Cluster Learning for Image-Text Pairs",
        BOOKTITLE = ICPR21,
        YEAR = "2021",
        PAGES = "8344--8351",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236850"}

@inproceedings{bb241935,
        AUTHOR = "Mikriukov, G. and Ravanbakhsh, M. and Demir, B.",
        TITLE = "An Unsupervised Cross-Modal Hashing Method Robust to Noisy Training Image-Text Correspondences in Remote Sensing",
        BOOKTITLE = ICIP22,
        YEAR = "2022",
        PAGES = "2556--2560",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236851"}

@inproceedings{bb241936,
        AUTHOR = "Anwaar, M. U. and Labintcev, E. and Kleinsteuber, M.",
        TITLE = "Compositional Learning of Image-Text Query for Image Retrieval",
        BOOKTITLE = WACV21,
        YEAR = "2021",
        PAGES = "1139--1148",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236852"}

@inproceedings{bb241937,
        AUTHOR = "Messina, N. and Falchi, F. and Esuli, A. and Amato, G.",
        TITLE = "Transformer Reasoning Network for Image-Text Matching and Retrieval",
        BOOKTITLE = ICPR21,
        YEAR = "2021",
        PAGES = "5222--5229",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236853"}

@inproceedings{bb241938,
        AUTHOR = "Zhang, Q. and Lei, Z. and Zhang, Z. X. and Li, S. Z.",
        TITLE = "Context-Aware Attention Network for Image-Text Retrieval",
        BOOKTITLE = CVPR20,
        YEAR = "2020",
        PAGES = "3533--3542",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236854"}

@inproceedings{bb241939,
        AUTHOR = "Chen, Y. C. and Li, L. J. and Yu, L. C. and El Kholy, A. and Ahmed, F. and Gan, Z. and Cheng, Y. and Liu, J. J.",
        TITLE = "{UNITER}: Universal Image-Text Representation Learning",
        BOOKTITLE = ECCV20,
        YEAR = "2020",
        PAGES = "XXX: 104--120",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236855"}

@inproceedings{bb241940,
        AUTHOR = "Wang, H. R. and Zhang, Y. and Ji, Z. and Pang, Y. W. and Ma, L.",
        TITLE = "Consensus-aware Visual-semantic Embedding for Image-Text Matching",
        BOOKTITLE = ECCV20,
        YEAR = "2020",
        PAGES = "XXIV: 18--34",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236856"}

@inproceedings{bb241941,
        AUTHOR = "Chen, T. L. and Deng, J. J. and Luo, J. B.",
        TITLE = "Adaptive Offline Quintuplet Loss for Image-text Matching",
        BOOKTITLE = ECCV20,
        YEAR = "2020",
        PAGES = "XIII: 549--565",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236857"}

@inproceedings{bb241942,
        AUTHOR = "Lee, K. H. and Chen, X. and Hua, G. and Hu, H. D. and He, X. D.",
        TITLE = "Stacked Cross Attention for Image-Text Matching",
        BOOKTITLE = ECCV18,
        YEAR = "2018",
        PAGES = "II: 212--228",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236858"}

@inproceedings{bb241943,
        AUTHOR = "Plummer, B. A. and Kordas, P. and Kiapour, M. H. and Zheng, S. and Piramuthu, R. and Lazebnik, S.",
        TITLE = "Conditional Image-Text Embedding Networks",
        BOOKTITLE = ECCV18,
        YEAR = "2018",
        PAGES = "XII: 258--274",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT236859"}

@article{bb241944,
        AUTHOR = "Yang, Z. Y. and Kumar, T. and Chen, T. L. and Su, J. S. and Luo, J. B.",
        TITLE = "Grounding-Tracking-Integration",
        JOURNAL = CirSysVideo,
        VOLUME = "31",
        YEAR = "2021",
        NUMBER = "9",
        MONTH = sep,
        PAGES = "3433--3443",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236860"}

@article{bb241945,
        AUTHOR = "Zhang, W. X. and Ma, C. and Wu, Q. and Yang, X. K.",
        TITLE = "Language-Guided Navigation via Cross-Modal Grounding and Alternate Adversarial Learning",
        JOURNAL = CirSysVideo,
        VOLUME = "31",
        YEAR = "2021",
        NUMBER = "9",
        MONTH = sep,
        PAGES = "3469--3481",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236861"}

@article{bb241946,
        AUTHOR = "Zhai, S. L. and Guo, G. B. and Yuan, F. J. and Liu, Y. and Wang, X. W.",
        TITLE = "{VSE-fs}: Fast Full-Sample Visual Semantic Embedding",
        JOURNAL = IEEE_Int_Sys,
        VOLUME = "36",
        YEAR = "2021",
        NUMBER = "4",
        MONTH = jul,
        PAGES = "3--12",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236862"}

@article{bb241947,
        AUTHOR = "Bargal, S. A. and Zunino, A. and Petsiuk, V. and Zhang, J. M. and Saenko, K. and Murino, V. and Sclaroff, S.",
        TITLE = "Guided Zoom: Zooming into Network Evidence to Refine Fine-Grained Model Decisions",
        JOURNAL = PAMI,
        VOLUME = "43",
        YEAR = "2021",
        NUMBER = "11",
        MONTH = nov,
        PAGES = "4196--4202",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236863"}

@article{bb241948,
        AUTHOR = "Hong, R. C. and Liu, D. and Mo, X. Y. and He, X. N. and Zhang, H. W.",
        TITLE = "Learning to Compose and Reason with Language Tree Structures for Visual Grounding",
        JOURNAL = PAMI,
        VOLUME = "44",
        YEAR = "2022",
        NUMBER = "2",
        MONTH = feb,
        PAGES = "684--696",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236864"}

@inproceedings{bb241949,
        AUTHOR = "Tang, K. H. and Zhang, H. W. and Wu, B. Y. and Luo, W. H. and Liu, W.",
        TITLE = "Learning to Compose Dynamic Tree Structures for Visual Contexts",
        BOOKTITLE = CVPR19,
        YEAR = "2019",
        PAGES = "6612--6621",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236865"}

@article{bb241950,
        AUTHOR = "Bin, Y. and Ding, Y. J. and Peng, B. and Peng, L. and Yang, Y. and Chua, T. S.",
        TITLE = "Entity Slot Filling for Visual Captioning",
        JOURNAL = CirSysVideo,
        VOLUME = "32",
        YEAR = "2022",
        NUMBER = "1",
        MONTH = jan,
        PAGES = "52--62",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236866"}

@article{bb241951,
        AUTHOR = "Chu, C. and Oliveira, V. and Virgo, F. G. and Otani, M. and Garcia, N. and Nakashima, Y.",
        TITLE = "The semantic typology of visually grounded paraphrases",
        JOURNAL = CVIU,
        VOLUME = "215",
        YEAR = "2022",
        PAGES = "103333",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236867"}

@article{bb241952,
        AUTHOR = "Deng, C. R. and Wu, Q. and Wu, Q. Y. and Hu, F. Y. and Lyu, F. and Tan, M. K.",
        TITLE = "Visual Grounding Via Accumulated Attention",
        JOURNAL = PAMI,
        VOLUME = "44",
        YEAR = "2022",
        NUMBER = "3",
        MONTH = mar,
        PAGES = "1670--1684",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236868"}

@inproceedings{bb241953,
        AUTHOR = "Deng, C. R. and Wu, Q. and Wu, Q. Y. and Hu, F. Y. and Lyu, F. and Tan, M. K.",
        TITLE = "Visual Grounding Via Accumulated Attention",
        BOOKTITLE = CVPR18,
        YEAR = "2018",
        PAGES = "7746--7755",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236868"}

@article{bb241954,
        AUTHOR = "Yu, X. T. and Zhang, H. M. and Hong, R. X. and Song, Y. Q. and Zhang, C. S.",
        TITLE = "{VD-PCR}: Improving visual dialog with pronoun coreference resolution",
        JOURNAL = PR,
        VOLUME = "125",
        YEAR = "2022",
        PAGES = "108540",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236869"}

@article{bb241955,
        AUTHOR = "Yuan, Y. T. and Ma, L. and Wang, J. W. and Liu, W. and Zhu, W. W.",
        TITLE = "Semantic Conditioned Dynamic Modulation for Temporal Sentence Grounding in Videos",
        JOURNAL = PAMI,
        VOLUME = "44",
        YEAR = "2022",
        NUMBER = "5",
        MONTH = may,
        PAGES = "2725--2741",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236870"}

@article{bb241956,
        AUTHOR = "He, S. and Yang, X. F. and Lin, G. S.",
        TITLE = "Learning language to symbol and language to vision mapping for visual grounding",
        JOURNAL = IVC,
        VOLUME = "122",
        YEAR = "2022",
        PAGES = "104451",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236871"}

@article{bb241957,
        AUTHOR = "Jiang, W. H. and Zhu, M. and Fang, Y. M. and Shi, G. M. and Zhao, X. W. and Liu, Y.",
        TITLE = "Visual Cluster Grounding for Image Captioning",
        JOURNAL = IP,
        VOLUME = "31",
        YEAR = "2022",
        PAGES = "3920--3934",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236872"}

@article{bb241958,
        AUTHOR = "Liao, Y. and Zhang, A. and Chen, Z. Y. and Hui, T. R. and Liu, S.",
        TITLE = "Progressive Language-Customized Visual Feature Learning for One-Stage Visual Grounding",
        JOURNAL = IP,
        VOLUME = "31",
        YEAR = "2022",
        PAGES = "4266--4277",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236873"}

@article{bb241959,
        AUTHOR = "Ding, X. P. and Wang, N. N. and Zhang, S. W. and Huang, Z. Y. and Li, X. M. and Tang, M. Q. and Liu, T. L. and Gao, X. B.",
        TITLE = "Exploring Language Hierarchy for Video Grounding",
        JOURNAL = IP,
        VOLUME = "31",
        YEAR = "2022",
        PAGES = "4693--4706",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236874"}

@article{bb241960,
        AUTHOR = "Xu, Z. and Chen, D. and Wei, K. and Deng, C. and Xue, H.",
        TITLE = "{HiSA}: Hierarchically Semantic Associating for Video Temporal Grounding",
        JOURNAL = IP,
        VOLUME = "31",
        YEAR = "2022",
        PAGES = "5178--5188",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236875"}

@article{bb241961,
        AUTHOR = "Gao, J. L. and Sun, X. and Ghanem, B. and Zhou, X. and Ge, S. M.",
        TITLE = "Efficient Video Grounding With Which-Where Reading Comprehension",
        JOURNAL = CirSysVideo,
        VOLUME = "32",
        YEAR = "2022",
        NUMBER = "10",
        MONTH = oct,
        PAGES = "6900--6913",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236876"}

@article{bb241962,
        AUTHOR = "Zhou, H. and Zhang, C. Y. and Luo, Y. and Hu, C. P. and Zhang, W. J.",
        TITLE = "Thinking Inside Uncertainty: Interest Moment Perception for Diverse Temporal Grounding",
        JOURNAL = CirSysVideo,
        VOLUME = "32",
        YEAR = "2022",
        NUMBER = "10",
        MONTH = oct,
        PAGES = "7190--7203",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236877"}

@article{bb241963,
        AUTHOR = "Tang, Z. H. and Liao, Y. and Liu, S. and Li, G. B. and Jin, X. J. and Jiang, H. X. and Yu, Q. and Xu, D.",
        TITLE = "Human-Centric Spatio-Temporal Video Grounding With Visual Transformers",
        JOURNAL = CirSysVideo,
        VOLUME = "32",
        YEAR = "2022",
        NUMBER = "12",
        MONTH = dec,
        PAGES = "8238--8249",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236878"}

@article{bb241964,
        AUTHOR = "Wang, W. and Gao, J. Y. and Xu, C. S.",
        TITLE = "Weakly-Supervised Video Object Grounding via Causal Intervention",
        JOURNAL = PAMI,
        VOLUME = "45",
        YEAR = "2023",
        NUMBER = "3",
        MONTH = mar,
        PAGES = "3933--3948",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236879"}

@article{bb241965,
        AUTHOR = "Wang, W. and Gao, J. Y. and Xu, C. S.",
        TITLE = "Weakly-Supervised Video Object Grounding via Learning Uni-Modal Associations",
        JOURNAL = MultMed,
        VOLUME = "25",
        YEAR = "2023",
        PAGES = "6329--6340",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236880"}

@article{bb241966,
        AUTHOR = "Nayyeri, M. and Xu, C. J. and Alam, M. M. and Lehmann, J. and Yazdi, H. S.",
        TITLE = "{LogicENN}: A Neural Based Knowledge Graphs Embedding Model With Logical Rules",
        JOURNAL = PAMI,
        VOLUME = "45",
        YEAR = "2023",
        NUMBER = "6",
        MONTH = jun,
        PAGES = "7050--7062",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236881"}

@article{bb241967,
        AUTHOR = "Chen, T. B. and Wang, W. and Han, K. and Xu, H. J.",
        TITLE = "{SaGCN}: Semantic-Aware Graph Calibration Network for Temporal Sentence Grounding",
        JOURNAL = CirSysVideo,
        VOLUME = "33",
        YEAR = "2023",
        NUMBER = "6",
        MONTH = jun,
        PAGES = "3003--3016",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236882"}

@article{bb241968,
        AUTHOR = "Zhang, H. and Sun, A. and Jing, W. and Zhou, J. T. Y.",
        TITLE = "Temporal Sentence Grounding in Videos: A Survey and Future Directions",
        JOURNAL = PAMI,
        VOLUME = "45",
        YEAR = "2023",
        NUMBER = "8",
        MONTH = aug,
        PAGES = "10443--10465",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236883"}

@article{bb241969,
        AUTHOR = "Deng, J. J. and Yang, Z. Y. and Liu, D. and Chen, T. L. and Zhou, W. G. and Zhang, Y. and Li, H. Q. and Ouyang, W. L.",
        TITLE = "{TransVG++}: End-to-End Visual Grounding With Language Conditioned Vision Transformer",
        JOURNAL = PAMI,
        VOLUME = "45",
        YEAR = "2023",
        NUMBER = "11",
        MONTH = nov,
        PAGES = "13636--13652",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236884"}

@inproceedings{bb241970,
        AUTHOR = "Deng, J. J. and Yang, Z. Y. and Chen, T. L. and Zhou, W. G. and Li, H. Q.",
        TITLE = "{TransVG}: End-to-End Visual Grounding with Transformers",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "1749--1759",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236885"}

@article{bb241971,
        AUTHOR = "Li, J. C. and Tang, S. L. and Zhu, L. C. and Zhang, W. Q. and Yang, Y. and Chua, T. S. and Wu, F. and Zhuang, Y. T.",
        TITLE = "Variational Cross-Graph Reasoning and Adaptive Structured Semantics Learning for Compositional Temporal Grounding",
        JOURNAL = PAMI,
        VOLUME = "45",
        YEAR = "2023",
        NUMBER = "10",
        MONTH = oct,
        PAGES = "12601--12617",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236886"}

@inproceedings{bb241972,
        AUTHOR = "Li, J. C. and Xie, J. L. and Qian, L. and Zhu, L. C. and Tang, S. L. and Wu, F. and Yang, Y. and Zhuang, Y. T. and Wang, X. E.",
        TITLE = "Compositional Temporal Grounding with Structured Variational Cross-Graph Correspondence Learning",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "3022--3031",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236887"}

@article{bb241973,
        AUTHOR = "Gonzalez, C. and Ayobi, N. and Hernandez, I. and Pont-Tuset, J. and Arbelaez, P.",
        TITLE = "{PiGLET}: Pixel-Level Grounding of Language Expressions With Transformers",
        JOURNAL = PAMI,
        VOLUME = "45",
        YEAR = "2023",
        NUMBER = "10",
        MONTH = oct,
        PAGES = "12206--12221",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236888"}

@article{bb241974,
        AUTHOR = "Zhang, R. S. and Wang, C. and Liu, C. L.",
        TITLE = "Cycle-Consistent Weakly Supervised Visual Grounding With Individual and Contextual Representations",
        JOURNAL = IP,
        VOLUME = "32",
        YEAR = "2023",
        PAGES = "5167--5180",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236889"}

@article{bb241975,
        AUTHOR = "Wang, Y. and Su, Y. T. and Li, W. H. and Xiao, J. and Li, X. Y. and Liu, A. A.",
        TITLE = "Dual-Path Rare Content Enhancement Network for Image and Text Matching",
        JOURNAL = CirSysVideo,
        VOLUME = "33",
        YEAR = "2023",
        NUMBER = "10",
        MONTH = oct,
        PAGES = "6144--6158",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236890"}

@article{bb241976,
        AUTHOR = "Xu, Z. and Wei, K. and Yang, X. and Deng, C.",
        TITLE = "Point-Supervised Video Temporal Grounding",
        JOURNAL = MultMed,
        VOLUME = "25",
        YEAR = "2023",
        PAGES = "6121--6131",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236891"}

@article{bb241977,
        AUTHOR = "Luo, F. and Chen, S. X. and Chen, J. J. and Wu, Z. X. and Jiang, Y. G.",
        TITLE = "Self-Supervised Learning for Semi-Supervised Temporal Language Grounding",
        JOURNAL = MultMed,
        VOLUME = "25",
        YEAR = "2023",
        PAGES = "7747--7757",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236892"}

@article{bb241978,
        AUTHOR = "Liu, D. Z. and Fang, X. and Hu, W. and Zhou, P.",
        TITLE = "Exploring Optical-Flow-Guided Motion and Detection-Based Appearance for Temporal Sentence Grounding",
        JOURNAL = MultMed,
        VOLUME = "25",
        YEAR = "2023",
        PAGES = "8539--8553",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236893"}

@article{bb241979,
        AUTHOR = "Yang, X. F. and Liu, F. and Lin, G. S.",
        TITLE = "Effective End-to-End Vision Language Pretraining With Semantic Visual Loss",
        JOURNAL = MultMed,
        VOLUME = "25",
        YEAR = "2023",
        PAGES = "8408--8417",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236894"}

@article{bb241980,
  author    = {Ma, G.Q. and Bai, Y. and Zhang, W. and Yao, T. and Shihada, B. and Mei, T.},
  title     = {Boosting Generic Visual-Linguistic Representation With Dynamic
Contexts},
  journal   = MultMed,
  volume    = {25},
  year      = {2023},
  pages     = {8445-8457},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236895}
}

@article{bb241981,
  author    = {Su, C. and Li, Z. and Lei, T.Y. and Peng, D.Z. and Wang, X.},
  title     = {MetaVG: A Meta-Learning Framework for Visual Grounding},
  journal   = SPLetters,
  volume    = {31},
  year      = {2024},
  pages     = {236-240},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236896}
}

@article{bb241982,
  author    = {Fang, X. and Liu, D. and Zhou, P. and Xu, Z.C. and Li, R.X.},
  title     = {Hierarchical Local-Global Transformer for Temporal Sentence Grounding},
  journal   = MultMed,
  volume    = {26},
  year      = {2024},
  pages     = {3263-3277},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236897}
}

@article{bb241983,
  author    = {Wang, Z.Y. and Yang, C. and Jiang, B. and Yuan, J.S.},
  title     = {A Dual Reinforcement Learning Framework for Weakly Supervised Phrase
Grounding},
  journal   = MultMed,
  volume    = {26},
  year      = {2024},
  pages     = {394-405},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236898}
}

@article{bb241984,
  author    = {Lu, Y. and Quan, R.J. and Zhu, L.C. and Yang, Y.},
  title     = {Zero-Shot Video Grounding With Pseudo Query Lookup and Verification},
  journal   = IP,
  volume    = {33},
  year      = {2024},
  pages     = {1643-1654},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236899}
}

@article{bb241985,
  author    = {Wang, W.K. and Su, Y.T. and Liu, J. and Jing, P.G.},
  title     = {Adaptive proposal network based on generative adversarial learning
for weakly supervised temporal sentence grounding},
  journal   = PRL,
  volume    = {179},
  year      = {2024},
  pages     = {9-16},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236900}
}

@article{bb241986,
  author    = {Liu, M. and Zhou, D. and Guo, J. and Luo, X. and Gao, Z. and Nie, L.Q.},
  title     = {Semantic-Aware Contrastive Learning With Proposal Suppression for
Video Semantic Role Grounding},
  journal   = CirSysVideo,
  volume    = {34},
  year      = {2024},
  number    = {4},
  month     = {April},
  pages     = {3003-3016},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236901}
}

@article{bb241987,
  author    = {Tang, W. and Li, L. and Liu, X.J. and Jin, L. and Tang, J.H. and Li, Z.C.},
  title     = {Context Disentangling and Prototype Inheriting for Robust Visual
Grounding},
  journal   = PAMI,
  volume    = {46},
  year      = {2024},
  number    = {5},
  month     = {May},
  pages     = {3213-3229},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236902}
}

@article{bb241988,
  author    = {Shi, F.Y. and Huang, W.L. and Wang, L.M.},
  title     = {End-to-end dense video grounding via parallel regression},
  journal   = CVIU,
  volume    = {242},
  year      = {2024},
  pages     = {103980},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236903}
}

@article{bb241989,
  author    = {Shao, R. and Wu, T.X. and Wu, J.L. and Nie, L.Q. and Liu, Z.W.},
  title     = {Detecting and Grounding Multi-Modal Media Manipulation and Beyond},
  journal   = PAMI,
  volume    = {46},
  year      = {2024},
  number    = {8},
  month     = {August},
  pages     = {5556-5574},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236904}
}

@inproceedings{bb241990,
  author    = {Shao, R. and Wu, T.X. and Liu, Z.W.},
  title     = {Detecting and Grounding Multi-Modal Media Manipulation},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {6904-6913},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236905}
}

@article{bb241991,
  author    = {Chen, L. and Deng, Z. and Liu, L.B. and Yin, S.},
  title     = {Multilevel Semantic Interaction Alignment for Video-Text Cross-Modal
Retrieval},
  journal   = CirSysVideo,
  volume    = {34},
  year      = {2024},
  number    = {7},
  month     = {July},
  pages     = {6559-6575},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236906}
}

@article{bb241992,
  author    = {Wu, Q.Q. and Guo, L.J. and Zhang, R. and Qian, J.B. and Gao, S.},
  title     = {QSMT-net: A query-sensitive proposal and multi-temporal-span matching
network for video grounding},
  journal   = IVC,
  volume    = {149},
  year      = {2024},
  pages     = {105188},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236907}
}

@article{bb241993,
  author    = {Wu, W. and Cao, M. and Hu, Y. and Peng, Y. and Qin, L. and Yin, Q.},
  title     = {Visual Grounding With Dual Knowledge Distillation},
  journal   = CirSysVideo,
  volume    = {34},
  year      = {2024},
  number    = {10},
  month     = {October},
  pages     = {10399-10410},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236908}
}

@article{bb241994,
  author    = {Li, S.T. and Li, B. and Sun, B. and Weng, Y.X.},
  title     = {Towards Visual-Prompt Temporal Answer Grounding in Instructional
Video},
  journal   = PAMI,
  volume    = {46},
  year      = {2024},
  number    = {12},
  month     = {December},
  pages     = {8836-8853},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236909}
}

@inproceedings{bb241995,
        AUTHOR = "Fang, X. and Xiong, Z. and Fang, W.L. and Qu, X.Y. and Chen, C. and Dong, J.F. and Tang, K. and Zhou, P. and Cheng, Y. and Liu, D.Z.",
        TITLE = "Rethinking Weakly-supervised Video Temporal Grounding From a Game
Perspective",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XLV: 290-311",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236910"}

@article{bb241996,
  author    = {Xiong, Z. and Liu, D.Z. and Fang, X. and Qu, X.Y. and Dong, J.F. and Zhu, J.H. and Tang, K. and Zhou, P.},
  title     = {Rethinking Video Sentence Grounding from a Tracking Perspective With
Memory Network and Masked Attention},
  journal   = MultMed,
  volume    = {26},
  year      = {2024},
  pages     = {11204-11218},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236911}
}

@article{bb241997,
  author    = {Qi, Z.B. and Yuan, Y. and Ruan, X.W. and Wang, S.H. and Zhang, W.G. and Huang, Q.M.},
  title     = {Collaborative Debias Strategy for Temporal Sentence Grounding in
Video},
  journal   = CirSysVideo,
  volume    = {34},
  year      = {2024},
  number    = {11},
  month     = {November},
  pages     = {10972-10986},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236912}
}

@article{bb241998,
  author    = {Dong, J.X. and Yin, Z.Z.},
  title     = {Graph-based Dense Event Grounding with relative positional encoding},
  journal   = CVIU,
  volume    = {251},
  year      = {2025},
  pages     = {104257},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236913}
}

@article{bb241999,
  author    = {Tang, K.F. and He, L.H. and Wang, N.N. and Gao, X.B.},
  title     = {Dual Semantic Reconstruction Network for Weakly Supervised Temporal
Sentence Grounding},
  journal   = MultMed,
  volume    = {27},
  year      = {2025},
  pages     = {95-107},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236914}
}

Last update: Mar 28, 2026 at 17:09:41