@inproceedings{bb280800,
  author    = {Galland, L. and Pelachaud, C. and Pecune, F.},
  title     = {Seeing and Hearing What Has Not Been Said: A multimodal client behavior classifier in Motivational Interviewing with interpretable fusion},
  booktitle = FG24,
  year      = {2024},
  pages     = {1-9},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275500},
}

@inproceedings{bb280801,
  author    = {Praveen, R.G. and Alam, J.},
  title     = {Audio-Visual Person Verification Based on Recursive Fusion of Joint Cross-Attention},
  booktitle = FG24,
  year      = {2024},
  pages     = {1-5},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275501},
}

@inproceedings{bb280802,
  author    = {Praveen, R.G. and Alam, J.},
  title     = {Dynamic Cross Attention for Audio-Visual Person Verification},
  booktitle = FG24,
  year      = {2024},
  pages     = {1-5},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275502},
}

@inproceedings{bb280803,
  author    = {He, Y.H. and Shin, S. and Cherian, A. and Trigoni, N. and Markham, A.},
  title     = {{Sound3DVDet}: {3D} Sound Source Detection using Multiview Microphone Array and {RGB} Images},
  booktitle = WACV24,
  year      = {2024},
  pages     = {5484-5495},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275503},
}

@inproceedings{bb280804,
  author    = {Ghaleb, E. and Burenko, I. and Rasenberg, M. and Pouw, W. and Uhrig, P. and Holler, J. and Toni, I. and Ozyurek, A. and Fernandez, R.},
  title     = {Co-Speech Gesture Detection through Multi-Phase Sequence Labeling},
  booktitle = WACV24,
  year      = {2024},
  pages     = {3995-4003},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275504},
}

@inproceedings{bb280805,
  author    = {Liu, J.X. and Wang, Y. and Ju, C. and Ma, C.F. and Zhang, Y. and Xie, W.},
  title     = {Annotation-free Audio-Visual Segmentation},
  booktitle = WACV24,
  year      = {2024},
  pages     = {5592-5602},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275505},
}

@inproceedings{bb280806,
  author    = {Xu, Y.T. and Hu, C.H. and Lee, G.H.},
  title     = {Rethink Cross-Modal Fusion in Weakly-Supervised Audio-Visual Video Parsing},
  booktitle = WACV24,
  year      = {2024},
  pages     = {5603-5612},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275506},
}

@inproceedings{bb280807,
  author    = {Rachavarapu, K.K. and Ramakrishnan, K. and Rajagopalan, A.N.},
  title     = {Weakly-Supervised Audio-Visual Video Parsing with Prototype-Based Pseudo-Labeling},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {18952-18962},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275507},
}

@inproceedings{bb280808,
  author    = {Rachavarapu, K.K. and Rajagopalan, A.N.},
  title     = {Boosting Positive Segments for Weakly-Supervised Audio-Visual Video Parsing},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {10158-10168},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275508},
}

@inproceedings{bb280809,
  author    = {Chen, J. and Wang, W.G. and Liu, S. and Li, H.S. and Yang, Y.},
  title     = {Omnidirectional Information Gathering for Knowledge Transfer-based Audio-Visual Navigation},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {10959-10969},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275509},
}

@inproceedings{bb280810,
  author    = {Cheng, X.Z. and Jin, T. and Huang, R.J. and Li, L.J. and Lin, W. and Wang, Z. and Wang, Y. and Liu, H.D. and Yin, A.X. and Zhao, Z.},
  title     = {{MixSpeech}: Cross-Modality Self-Learning with Audio-Visual Stream Mixup for Visual Speech Translation and Recognition},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {15689-15699},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275510},
}

@inproceedings{bb280811,
  author    = {Georgescu, M.I. and Fonseca, E. and Ionescu, R.T. and Lucic, M. and Schmid, C. and Arnab, A.},
  title     = {Audiovisual Masked Autoencoders},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {16098-16108},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275511},
}

@inproceedings{bb280812,
  author    = {Chen, M.F. and Su, K. and Shlizerman, E.},
  title     = {Be Everywhere - Hear Everything ({BEE}): Audio Scene Reconstruction by Sparse Audio-Visual Samples},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {7819-7828},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275512},
}

@inproceedings{bb280813,
  author    = {Xie, H.X. and Lee, M.X. and Chen, T.J. and Chen, H.J. and Liu, H.I. and Shuai, H.H. and Cheng, W.H.},
  title     = {Most Important Person-guided Dual-branch Cross-Patch Attention for Group Affect Recognition},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {20541-20551},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275513},
}

@inproceedings{bb280814,
  author    = {Djilali, Y.A.D. and Narayan, S. and Boussaid, H. and Almazrouei, E. and Debbah, M.},
  title     = {{Lip2Vec}: Efficient and Robust Visual Speech Recognition via Latent-to-Latent Visual to Audio Representation Mapping},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {13744-13755},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275514},
}

@inproceedings{bb280815,
  author    = {Chen, G.Y. and Zhang, D. and Liu, T. and Du, X.Y.},
  title     = {Local-Global Contrast for Learning Voice-Face Representations},
  booktitle = ICIP23,
  year      = {2023},
  pages     = {51-55},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275515},
}

@inproceedings{bb280816,
  author    = {Hong, J. and Kim, M. and Choi, J. and Ro, Y.M.},
  title     = {Watch or Listen: Robust Audio-Visual Speech Recognition with Visual Corruption Modeling and Reliability Scoring},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {18783-18794},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275516},
}

@inproceedings{bb280817,
  author    = {Porgali, B. and Albiero, V. and Ryda, J. and Ferrer, C.C. and Hazirbas, C.},
  title     = {The {Casual Conversations} v2 Dataset: A diverse, large benchmark for measuring fairness and robustness in audio/vision/speech models},
  booktitle = FaDE-TCV23,
  year      = {2023},
  pages     = {10-17},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275517},
}

@inproceedings{bb280818,
  author    = {Xiong, J.W. and Wang, G. and Zhang, P. and Huang, W. and Zha, Y.F. and Zhai, G.T.},
  title     = {{CASP-Net}: Rethinking Video Saliency Prediction from an Audio-Visual Consistency Perceptual Perspective},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {6441-6450},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275518},
}

@inproceedings{bb280819,
  author    = {Liao, J.H. and Duan, H.H. and Feng, K.H. and Zhao, W.B. and Yang, Y.B. and Chen, L.Y.},
  title     = {A Light Weight Model for Active Speaker Detection},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {22932-22941},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275519},
}

@inproceedings{bb280820,
  author    = {Seo, P.H. and Nagrani, A. and Schmid, C.},
  title     = {{AVFormer}: Injecting Vision into Frozen Speech Models for Zero-Shot {AV-ASR}},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {22922-22931},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275520},
}

@inproceedings{bb280821,
  author    = {Feng, D. and Yang, S. and Shan, S.G. and Chen, X.L.},
  title     = {Audio-Driven Deformation Flow for Effective Lip Reading},
  booktitle = {ICPR22},
  year      = {2022},
  pages     = {274-280},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275521},
}

@inproceedings{bb280822,
  author    = {Varshney, M. and Yadav, R. and Namboodiri, V.P. and Hegde, R.M.},
  title     = {Learning Speaker-specific Lip-to-Speech Generation},
  booktitle = {ICPR22},
  year      = {2022},
  pages     = {491-498},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275522},
}

@inproceedings{bb280823,
  author    = {Shi, C. and Yang, S.},
  title     = {Spatial and Visual Perspective-Taking via View Rotation and Relation Reasoning for Embodied Reference Understanding},
  booktitle = ECCV22,
  year      = {2022},
  pages     = {XXXVI:201-218},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275523},
}

@inproceedings{bb280824,
  author    = {Hayes, T. and Zhang, S.Y. and Yin, X. and Pang, G. and Sheng, S. and Yang, H. and Ge, S.W. and Hu, Q.Y. and Parikh, D.},
  title     = {{MUGEN}: A Playground for Video-Audio-Text Multimodal Understanding and {GENeration}},
  booktitle = ECCV22,
  year      = {2022},
  pages     = {VIII:431-449},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275524},
}

@inproceedings{bb280825,
  author    = {van Horn, G. and Qian, R. and Wilber, K. and Adam, H. and Mac Aodha, O. and Belongie, S.},
  title     = {Exploring Fine-Grained Audiovisual Categorization with the {SSW60} Dataset},
  booktitle = ECCV22,
  year      = {2022},
  pages     = {VIII:271-289},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275525},
}

@inproceedings{bb280826,
  author    = {Yu, S. and Wu, P. and Liang, P.P. and Salakhutdinov, R. and Morency, L.P.},
  title     = {{PACS}: A Dataset for Physical Audiovisual {CommonSense} Reasoning},
  booktitle = ECCV22,
  year      = {2022},
  pages     = {XXXVII:292-309},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275526},
}

@inproceedings{bb280827,
  author    = {Cheng, H.Y. and Liu, Z.Y. and Zhou, H. and Qian, C. and Wu, W. and Wang, L.M.},
  title     = {Joint-Modal Label Denoising for Weakly-Supervised Audio-Visual Video Parsing},
  booktitle = ECCV22,
  year      = {2022},
  pages     = {XXXIV:431-448},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275527},
}

@inproceedings{bb280828,
  author    = {Zhang, Z.Q. and Zhang, J. and Zhang, J.S. and Wu, M.H. and Fang, X. and Dai, L.R.},
  title     = {Learning Contextually Fused Audio-Visual Representations for Audio-Visual Speech Recognition},
  booktitle = ICIP22,
  year      = {2022},
  pages     = {1346-1350},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275528},
}

@inproceedings{bb280829,
  author    = {Montesinos, J.F. and Kadandale, V.S. and Haro, G.},
  title     = {{VoViT}: Low Latency Graph-Based Audio-Visual Voice Separation Transformer},
  booktitle = ECCV22,
  year      = {2022},
  pages     = {XXXVII:310-326},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275529},
}

@inproceedings{bb280830,
  author    = {Tzinis, E. and Wisdom, S. and Remez, T. and Hershey, J.R.},
  title     = {{AudioScopeV2}: Audio-Visual Attention Architectures for Calibrated Open-Domain On-Screen Sound Separation},
  booktitle = ECCV22,
  year      = {2022},
  pages     = {XXXVII:368-385},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275530},
}

@inproceedings{bb280831,
  author    = {Zhou, J.X. and Wang, J.Y. and Zhang, J.Y. and Sun, W.X. and Zhang, J. and Birchfield, S. and Guo, D. and Kong, L.P. and Wang, M. and Zhong, Y.R.},
  title     = {Audio-Visual Segmentation},
  booktitle = ECCV22,
  year      = {2022},
  pages     = {XXXVII:386-403},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275531},
}

@inproceedings{bb280832,
  author    = {Alcazar, J.L. and Cordes, M. and Zhao, C. and Ghanem, B.},
  title     = {End-to-End Active Speaker Detection},
  booktitle = ECCV22,
  year      = {2022},
  pages     = {XXXVII:126-143},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275532},
}

@inproceedings{bb280833,
  author    = {Chen, C.G. and Gao, R.H. and Calamia, P. and Grauman, K.},
  title     = {Visual Acoustic Matching},
  booktitle = CVPR22,
  year      = {2022},
  pages     = {18836-18846},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275533},
}

@inproceedings{bb280834,
  author    = {Lee, S. and Kim, H.I. and Ro, Y.M.},
  title     = {Weakly Paired Associative Learning for Sound and Image Representations via Bimodal Associative Memory},
  booktitle = CVPR22,
  year      = {2022},
  pages     = {10524-10533},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275534},
}

@inproceedings{bb280835,
  author    = {Vasudevan, A.B. and Dai, D.X. and Van Gool, L.J.},
  title     = {Sound and Visual Representation Learning with Multiple Pretraining Tasks},
  booktitle = CVPR22,
  year      = {2022},
  pages     = {14596-14606},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275535},
}

@inproceedings{bb280836,
  author    = {Ng, E. and Joo, H. and Hu, L.W. and Li, H. and Darrell, T.J. and Kanazawa, A. and Ginosar, S.},
  title     = {Learning to Listen: Modeling Non-Deterministic Dyadic Facial Motion},
  booktitle = CVPR22,
  year      = {2022},
  pages     = {20363-20373},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275536},
}

@inproceedings{bb280837,
  author    = {Kurzendorfer, D. and Mercea, O.B. and Koepke, A.S. and Akata, Z.},
  title     = {Audio-Visual Generalized Zero-Shot Learning using Pre-Trained Large Multi-Modal Models},
  booktitle = L3D-IVU24,
  year      = {2024},
  pages     = {2627-2638},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275537},
}

@inproceedings{bb280838,
  author    = {Mercea, O.B. and Hummel, T. and Koepke, A.S. and Akata, Z.},
  title     = {Temporal and Cross-modal Attention for Audio-Visual Zero-Shot Learning},
  booktitle = ECCV22,
  year      = {2022},
  pages     = {XX:488-505},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275538},
}

@inproceedings{bb280839,
  author    = {Mercea, O.B. and Riesch, L. and Koepke, A.S. and Akata, Z.},
  title     = {Audiovisual Generalised Zero-shot Learning with Cross-modal Attention and Language},
  booktitle = CVPR22,
  year      = {2022},
  pages     = {10543-10553},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275539},
}

@inproceedings{bb280840,
  author    = {Karas, V. and Tellamekala, M.K. and Mallol Ragolta, A. and Valstar, M. and Schuller, B.W.},
  title     = {Time-Continuous Audiovisual Fusion with Recurrence vs Attention for In-The-Wild Affect Recognition},
  booktitle = ABAW22,
  year      = {2022},
  pages     = {2381-2390},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275540},
}

@inproceedings{bb280841,
  author    = {Yang, K. and Markovic, D. and Krenn, S. and Agrawal, V. and Richard, A.},
  title     = {Audio-Visual Speech Codecs: Rethinking Audio-Visual Speech Enhancement by Re-Synthesis},
  booktitle = CVPR22,
  year      = {2022},
  pages     = {8217-8227},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275541},
}

@inproceedings{bb280842,
  author    = {Kim, M. and Hong, J. and Park, S.J. and Ro, Y.M.},
  title     = {Multi-modality Associative Bridging through Memory: Speech Sound Recollected from Face Video},
  booktitle = ICCV21,
  year      = {2021},
  pages     = {296-306},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275542},
}

@inproceedings{bb280843,
  author    = {Li, J. and Kang, D. and Pei, W.J. and Zhe, X.F. and Zhang, Y. and He, Z.Y. and Bao, L.C.},
  title     = {{Audio2Gestures}: Generating Diverse Gestures from Speech Audio with Conditional Variational Autoencoders},
  booktitle = ICCV21,
  year      = {2021},
  pages     = {11273-11282},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275543},
}

@inproceedings{bb280844,
  author    = {Ye, M. and You, Q.Z. and Ma, F.L.},
  title     = {{QUALIFIER}: Question-Guided Self-Attentive Multimodal Fusion Network for Audio Visual Scene-Aware Dialog},
  booktitle = WACV22,
  year      = {2022},
  pages     = {2503-2511},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275544},
}

@inproceedings{bb280845,
  author    = {Yao, S. and Min, X.K. and Zhai, G.T.},
  title     = {Deep Audio-Visual Fusion Neural Network for Saliency Estimation},
  booktitle = ICIP21,
  year      = {2021},
  pages     = {1604-1608},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275545},
}

@inproceedings{bb280846,
  author    = {Krishnamurthy, S.},
  title     = {Learning Self-supervised Audio-Visual Representations for Sound Recommendations},
  booktitle = ISVC21,
  year      = {2021},
  pages     = {II:124-138},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275546},
}

@inproceedings{bb280847,
  author    = {Shi, W.J. and Pattichis, M.S. and Celedon Pattichis, S. and LopezLeiva, C.},
  title     = {Talking Detection in Collaborative Learning Environments},
  booktitle = CAIP21,
  year      = {2021},
  pages     = {II:242-251},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275547},
}

@inproceedings{bb280848,
  author    = {Wang, G. and Chen, C.L.Z. and Fan, D.P. and Hao, A. and Qin, H.},
  title     = {From Semantic Categories to Fixations: A Novel Weakly-supervised Visual-auditory Saliency Detection Approach},
  booktitle = CVPR21,
  year      = {2021},
  pages     = {15114-15123},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275548},
}

@inproceedings{bb280849,
  author    = {Wen, P.S. and Xu, Q.Q. and Jiang, Y.B.Y. and Yang, Z.Y. and He, Y. and Huang, Q.M.},
  title     = {Seeking the Shape of Sound: An Adaptive Framework for Learning Voice-Face Association},
  booktitle = CVPR21,
  year      = {2021},
  pages     = {16342-16351},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275549},
}

@inproceedings{bb280850,
  author    = {Monfort, M. and Jin, S. and Liu, A. and Harwath, D. and Feris, R.S. and Glass, J. and Oliva, A.},
  title     = {Spoken Moments: Learning Joint Audio-Visual Representations from Video Descriptions},
  booktitle = CVPR21,
  year      = {2021},
  pages     = {14866-14876},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275550},
}

@inproceedings{bb280851,
  author    = {Tian, Y.P. and Xu, C.L.},
  title     = {Can audio-visual integration strengthen robustness under multimodal attacks?},
  booktitle = CVPR21,
  year      = {2021},
  pages     = {5597-5607},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275551},
}

@inproceedings{bb280852,
  author    = {Morgado, P. and Vasconcelos, N.M. and Misra, I.},
  title     = {Audio-Visual Instance Discrimination with Cross-Modal Agreement},
  booktitle = CVPR21,
  year      = {2021},
  pages     = {12470-12481},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275552},
}

@inproceedings{bb280853,
  author    = {Morgado, P. and Misra, I. and Vasconcelos, N.M.},
  title     = {Robust Audio-Visual Instance Discrimination},
  booktitle = CVPR21,
  year      = {2021},
  pages     = {12929-12940},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275553},
}

@inproceedings{bb280854,
  author    = {Chen, Y.B. and Xian, Y.Q. and Koepke, A.S. and Shan, Y. and Akata, Z.},
  title     = {Distilling Audio-Visual Knowledge by Compositional Contrastive Learning},
  booktitle = CVPR21,
  year      = {2021},
  pages     = {7012-7021},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275554},
}

@inproceedings{bb280855,
  author    = {Zhang, Z.M. and Li, L.C. and Ding, Y. and Fan, C.J.},
  title     = {Flow-guided One-shot Talking Face Generation with a High-resolution Audio-visual Dataset},
  booktitle = CVPR21,
  year      = {2021},
  pages     = {3660-3669},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275555},
}

@inproceedings{bb280856,
  author    = {Gao, R.H. and Grauman, K.},
  title     = {{VisualVoice}: Audio-Visual Speech Separation with Cross-Modal Consistency},
  booktitle = CVPR21,
  year      = {2021},
  pages     = {15490-15500},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275556},
}

@inproceedings{bb280857,
  author    = {Mazumder, P. and Singh, P. and Parida, K.K. and Namboodiri, V.P.},
  title     = {{AVGZSLNet}: Audio-Visual Generalized Zero-Shot Learning by Reconstructing Label Features from Multi-Modal Embeddings},
  booktitle = WACV21,
  year      = {2021},
  pages     = {3089-3098},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275557},
}

@inproceedings{bb280858,
  author    = {Ishikawa, R. and Hachiuma, R. and Kurobe, A. and Saito, H.},
  title     = {Single-modal Incremental Terrain Clustering from Self-Supervised Audio-Visual Feature Learning},
  booktitle = ICPR21,
  year      = {2021},
  pages     = {9399-9406},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275558},
}

@inproceedings{bb280859,
  author    = {Madrigal, F. and Lerasle, F. and Pibre, L. and Ferrane, I.},
  title     = {Audio-Video detection of the active speaker in meetings},
  booktitle = ICPR21,
  year      = {2021},
  pages     = {2536-2543},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275559},
}

@inproceedings{bb280860,
  author    = {Tellamekala, M.K. and Valstar, M. and Pound, M. and Giesbrecht, T.},
  title     = {Audio-Visual Predictive Coding for Self-Supervised Visual Representation Learning},
  booktitle = ICPR21,
  year      = {2021},
  pages     = {9912-9919},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275560},
}

@inproceedings{bb280861,
  author    = {Liu, H. and Wang, Y. and Yang, B.},
  title     = {Mutual Alignment between Audiovisual Features for End-to-End Audiovisual Speech Recognition},
  booktitle = ICPR21,
  year      = {2021},
  pages     = {5348-5353},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275561},
}

@inproceedings{bb280862,
  author    = {Liu, H. and Xu, W.L. and Yang, B.},
  title     = {Audio-Visual Speech Recognition Using A Two-Step Feature Fusion Strategy},
  booktitle = ICPR21,
  year      = {2021},
  pages     = {1896-1903},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275562},
}

@inproceedings{bb280863,
  author    = {Liu, H. and Li, W.H. and Yang, B.},
  title     = {Robust Audio-Visual Speech Recognition Based on Hybrid Fusion},
  booktitle = ICPR21,
  year      = {2021},
  pages     = {7580-7586},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275563},
}

@inproceedings{bb280864,
  author    = {Chao, F.Y. and Ozcinar, C. and Zhang, L. and Hamidouche, W. and Deforges, O. and Smolic, A.},
  title     = {Towards Audio-Visual Saliency Prediction for Omnidirectional Video with Spatial Audio},
  booktitle = VCIP20,
  year      = {2020},
  pages     = {355-358},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275564},
}

@inproceedings{bb280865,
  author    = {Zhou, H. and Xu, X.D. and Lin, D. and Wang, X.G. and Liu, Z.W.},
  title     = {Sep-stereo: Visually Guided Stereophonic Audio Generation by Associating Source Separation},
  booktitle = ECCV20,
  year      = {2020},
  pages     = {XII:52-69},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275565},
}

@inproceedings{bb280866,
  author    = {Tian, Y.P. and Li, D.Z. and Xu, C.L.},
  title     = {Unified Multisensory Perception: Weakly-supervised Audio-visual Video Parsing},
  booktitle = ECCV20,
  year      = {2020},
  pages     = {III:436-454},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275566},
}

@inproceedings{bb280867,
  author    = {Salman, A.N. and Busso, C.},
  title     = {Dynamic versus Static Facial Expressions in the Presence of Speech},
  booktitle = FG20,
  year      = {2020},
  pages     = {436-443},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275567},
}

@inproceedings{bb280868,
  author    = {Salman, A.N. and Busso, C.},
  title     = {Style Extractor For Facial Expression Recognition in the Presence of Speech},
  booktitle = ICIP20,
  year      = {2020},
  pages     = {1806-1810},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275568},
}

@inproceedings{bb280869,
  author    = {Liu, Y.F. and Qiao, M.L. and Xu, M. and Li, B. and Hu, W.M. and Borji, A.},
  title     = {Learning to Predict Salient Faces: A Novel Visual-Audio Saliency Model},
  booktitle = ECCV20,
  year      = {2020},
  pages     = {XX:413-429},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275569},
}

@inproceedings{bb280870,
  author    = {Yang, K. and Russell, B. and Salamon, J.},
  title     = {Telling Left From Right: Learning Spatial Correspondence of Sight and Sound},
  booktitle = CVPR20,
  year      = {2020},
  pages     = {9929-9938},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275570},
}

@inproceedings{bb280871,
  author    = {Gao, R. and Oh, T. and Grauman, K. and Torresani, L.},
  title     = {Listen to Look: Action Recognition by Previewing Audio},
  booktitle = CVPR20,
  year      = {2020},
  pages     = {10454-10464},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275571},
}

@inproceedings{bb280872,
  author    = {Zhang, X. and Wu, X. and Zhai, X. and Ben, X. and Tu, C.},
  title     = {{DAVD-Net}: Deep Audio-Aided Video Decompression of Talking Heads},
  booktitle = CVPR20,
  year      = {2020},
  pages     = {12332-12341},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275572},
}

@inproceedings{bb280873,
  author    = {Vaezi Joze, H.R. and Shaban, A. and Iuzzolino, M.L. and Koishida, K.},
  title     = {{MMTM}: Multimodal Transfer Module for {CNN} Fusion},
  booktitle = CVPR20,
  year      = {2020},
  pages     = {13286-13296},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275573},
}

@inproceedings{bb280874,
  author    = {Alcazar, J.L. and Caba, F. and Mai, L. and Perazzi, F. and Lee, J. and Arbelaez, P. and Ghanem, B.},
  title     = {Active Speakers in Context},
  booktitle = CVPR20,
  year      = {2020},
  pages     = {12462-12471},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275574},
}

@inproceedings{bb280875,
  author    = {Huang, C. and Koishida, K.},
  title     = {Improved Active Speaker Detection based on Optical Flow},
  booktitle = MULWS20,
  year      = {2020},
  pages     = {4084-4090},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275575},
}

@inproceedings{bb280876,
  author    = {Ma, X.J. and Wu, C.C. and Li, Y.Y. and Zhong, Q.Y.},
  title     = {Speaker Identification System Based on Lip-Motion Feature},
  booktitle = CVS17,
  year      = {2017},
  pages     = {289-299},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275576},
}

@inproceedings{bb280877,
  author    = {Xu, B. and Lu, C. and Guo, Y. and Wang, J.},
  title     = {Discriminative Multi-Modality Speech Recognition},
  booktitle = CVPR20,
  year      = {2020},
  pages     = {14421-14430},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275577},
}

@inproceedings{bb280878,
  author    = {Subedar, M. and Krishnan, R. and Meyer, P.L. and Tickoo, O. and Huang, J.},
  title     = {Uncertainty-Aware Audiovisual Activity Recognition Using Deep {Bayesian} Variational Inference},
  booktitle = ICCV19,
  year      = {2019},
  pages     = {6300-6309},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275578},
}

@inproceedings{bb280879,
  author    = {Alamri, H. and Cartillier, V. and Das, A. and Wang, J. and Cherian, A. and Essa, I. and Batra, D. and Marks, T.K. and Hori, C. and Anderson, P. and Lee, S. and Parikh, D.},
  title     = {Audio Visual Scene-Aware Dialog},
  booktitle = CVPR19,
  year      = {2019},
  pages     = {7550-7559},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275579},
}

@inproceedings{bb280880,
  author    = {Niu, Y.L. and Zhang, H.W. and Zhang, M.L. and Zhang, J.H. and Lu, Z.W. and Wen, J.R.},
  title     = {Recursive Visual Attention in Visual Dialog},
  booktitle = CVPR19,
  year      = {2019},
  pages     = {6672-6681},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275580},
}

@inproceedings{bb280881,
  author    = {Schwartz, I. and Schwing, A.G. and Hazan, T.},
  title     = {A Simple Baseline for Audio-Visual Scene-Aware Dialog},
  booktitle = CVPR19,
  year      = {2019},
  pages     = {12540-12550},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275581},
}

@inproceedings{bb280882,
  author    = {Lu, Y. and Lee, H. and Tseng, H. and Yang, M.},
  title     = {Self-Supervised Audio Spatialization with Correspondence Classifier},
  booktitle = ICIP19,
  year      = {2019},
  pages     = {3347-3351},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275582},
}

@inproceedings{bb280883,
  author    = {Meng, D. and Peng, X. and Wang, K. and Qiao, Y.},
  title     = {Frame Attention Networks for Facial Expression Recognition in Videos},
  booktitle = ICIP19,
  year      = {2019},
  pages     = {3866-3870},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275583},
}

@inproceedings{bb280884,
  author    = {Shahid, M. and Beyan, C. and Murino, V.},
  title     = {Comparisons of Visual Activity Primitives for Voice Activity Detection},
  booktitle = CIAP19,
  year      = {2019},
  pages     = {I:48-59},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275584},
}

@inproceedings{bb280885,
  author    = {Kim, C.I. and Shin, H.J.V. and Oh, T.H. and Kaspar, A. and Elgharib, M. and Matusik, W.},
  title     = {On Learning Associations of Faces and Voices},
  booktitle = ACCV18,
  year      = {2018},
  pages     = {V:276-292},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275585},
}

% NOTE(review): booktitle below is a literal string, whereas neighbouring entries
% reference bare string macros (for example CVPR19) -- confirm whether a MMMod19
% macro is defined and should be referenced instead.
@inproceedings{bb280886,
  author    = {Schindler, A. and Boyer, M. and Lindley, A. and Schreiber, D. and Philipp, T.},
  title     = {Large Scale Audio-Visual Video Analytics Platform for Forensic Investigations of Terroristic Attacks},
  booktitle = {MMMod19},
  year      = {2019},
  pages     = {II:106-119},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275586},
}

% The braces around GAN keep the acronym capitalised under styles that
% sentence-case titles.
% NOTE(review): the end page 22297 looks garbled (it would imply a span of
% roughly 20,000 pages) -- verify the page range against the publisher record.
@inproceedings{bb280887,
        AUTHOR = "Oliveira, D.A.B. and Mattos, A.B. and da Silva Morais, E.",
        TITLE = "Improving Viseme Recognition Using {GAN}-Based Frontal View Mapping",
        BOOKTITLE = AMFG18,
        YEAR = "2018",
        PAGES = "2229-22297",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT275587"}

@inproceedings{bb280888,
  author    = {Yang, X. and Molchanov, P. and Kautz, J.},
  title     = {Making Convolutional Networks Recurrent for Visual Sequence Learning},
  booktitle = CVPR18,
  year      = {2018},
  pages     = {6469-6478},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275588},
}

% The braces around 3D stop sentence-casing styles from rendering it as "3d".
@inproceedings{bb280889,
        AUTHOR = "Zhang, J. and Richmond, K. and Fisher, R.B.",
        TITLE = "Dual-modality Talking-metrics: {3D} Visual-Audio Integrated
Behaviometric Cues from Speakers",
        BOOKTITLE = ICPR18,
        YEAR = "2018",
        PAGES = "3144-3149",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT275589"}

% The braces around MSU-AVIS keep the dataset acronym capitalised under styles
% that sentence-case titles.
@inproceedings{bb280890,
        AUTHOR = "Chowdhury, A. and Atoum, Y. and Tran, L. and Liu, X. and Ross, A.",
        TITLE = "{MSU-AVIS} dataset: Fusing Face and Voice Modalities for Biometric
Recognition in Indoor Surveillance Videos",
        BOOKTITLE = ICPR18,
        YEAR = "2018",
        PAGES = "3567-3573",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT275590"}

@inproceedings{bb280891,
  author    = {Nagrani, A. and Albanie, S. and Zisserman, A.},
  title     = {Seeing Voices and Hearing Faces: Cross-Modal Biometric Matching},
  booktitle = CVPR18,
  year      = {2018},
  pages     = {8427-8436},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275591},
}

% The braces around SSSD keep the acronym capitalised; without them a
% sentence-casing style would render it as "Sssd".
@inproceedings{bb280892,
        AUTHOR = "Saitoh, T. and Kubokawa, M.",
        TITLE = "{SSSD}: Speech Scene database by Smart Device for Visual Speech
Recognition",
        BOOKTITLE = ICPR18,
        YEAR = "2018",
        PAGES = "3228-3232",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT275592"}

% The page range carries a volume prefix; it is written without a space after
% the colon, the same form used by the other volume-prefixed ranges in this file.
@inproceedings{bb280893,
        AUTHOR = "Owens, A. and Efros, A.A.",
        TITLE = "Audio-Visual Scene Analysis with Self-Supervised Multisensory Features",
        BOOKTITLE = ECCV18,
        YEAR = "2018",
        PAGES = "VI:639-658",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT275593"}

@inproceedings{bb280894,
  author    = {Berlin, A.A. and Surati, R.},
  title     = {Video Deconfounding: Hearing-Aid Inspired Video Enhancement},
  booktitle = IVMSP18,
  year      = {2018},
  pages     = {1-5},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275594},
}

@inproceedings{bb280895,
  author    = {Ding, R. and Pang, C. and Liu, H.},
  title     = {Audio-Visual Keyword Spotting Based on Multidimensional Convolutional Neural Network},
  booktitle = ICIP18,
  year      = {2018},
  pages     = {4138-4142},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275595},
}

% The braces around 3D stop sentence-casing styles from rendering it as "3d".
@inproceedings{bb280896,
        AUTHOR = "Liao, J. and Wang, S. and Zhang, X. and Liu, G.",
        TITLE = "{3D} Convolutional Neural Networks Based Speaker Identification and
Authentication",
        BOOKTITLE = ICIP18,
        YEAR = "2018",
        PAGES = "2042-2046",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT275596"}

@inproceedings{bb280897,
  author    = {Savran, A. and Tavarone, R. and Higy, B. and Badino, L. and Bartolozzi, C.},
  title     = {Energy and Computation Efficient Audio-Visual Voice Activity Detection Driven by Event-Cameras},
  booktitle = FG18,
  year      = {2018},
  pages     = {333-340},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275597},
}

@inproceedings{bb280898,
  author    = {Ephrat, A. and Halperin, T. and Peleg, S.},
  title     = {Improved Speech Reconstruction from Silent Video},
  booktitle = CVAVM17,
  year      = {2017},
  pages     = {455-462},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275598},
}

@inproceedings{bb280899,
  author    = {Ahn, J. and Kim, Y.J. and Kim, D.J.},
  title     = {Patch-based visual microphone for improving quality of sound},
  booktitle = ICPR16,
  year      = {2016},
  pages     = {3927-3932},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT275599},
}

Last update: Sep 10, 2025 at 12:00:25