@inproceedings{Liu_2025_CVPR,
  abbr        = {CVPR},
  bibtex_show = {true},
  code        = {https://github.com/DragonLiu1995/xRIR_code},
  author      = {Liu, Xiulong and Kumar, Anurag and Calamia, Paul and Amengual, Sebastia V. and Murdock, Calvin and Ananthabhotla, Ishwarya and Robinson, Philip and Shlizerman, Eli and Ithapu, Vamsi Krishna and Gao, Ruohan},
  title       = {Hearing Anywhere in Any Environment},
  booktitle   = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR)},
  month       = jun,
  year        = {2025},
  pages       = {5732--5741},
  selected    = {true},
}
2024
NEURIPS
Tell What You Hear From What You See - Video to Audio Generation Through Text
Liu, Xiulong,
Su, Kun,
and Shlizerman, Eli
In Advances in Neural Information Processing Systems
2024
@inproceedings{liu2024vatt,
  abbr        = {NEURIPS},
  bibtex_show = {true},
  website     = {https://dragonliu1995.github.io/VATT-home/},
  code        = {https://github.com/DragonLiu1995/video-to-audio-through-text},
  author      = {Liu, Xiulong and Su, Kun and Shlizerman, Eli},
  title       = {Tell What You Hear From What You See - Video to Audio Generation Through Text},
  booktitle   = {Advances in Neural Information Processing Systems},
  editor      = {Globerson, A. and Mackey, L. and Belgrave, D. and Fan, A. and Paquet, U. and Tomczak, J. and Zhang, C.},
  volume      = {37},
  pages       = {101337--101366},
  publisher   = {Curran Associates, Inc.},
  url         = {https://proceedings.neurips.cc/paper_files/paper/2024/file/b782a3462ee9d566291cff148333ea9b-Paper-Conference.pdf},
  year        = {2024},
  selected    = {true},
}
ML4PHYS
Calo-VQ: Vector-quantized two-stage generative model in calorimeter simulation
@article{liu2024calo,
  abbr          = {ML4PHYS},
  bibtex_show   = {true},
  title         = {Calo-VQ: Vector-quantized two-stage generative model in calorimeter simulation},
  author        = {Liu, Qibin and Shimmin, Chase and Liu, Xiulong and Shlizerman, Eli and Li, Shu and Hsu, Shih-Chieh},
  journal       = {arXiv preprint arXiv:2405.06605},
  eprint        = {2405.06605},
  archiveprefix = {arXiv},
  year          = {2024},
}
ICML
From Vision to Audio and Beyond: A Unified Model for Audio-Visual Representation and Generation
Su, Kun,
Liu, Xiulong,
and Shlizerman, Eli
In Proceedings of the 41st International Conference on Machine Learning
21–27 Jul
2024
Video encompasses both visual and auditory data, creating a perceptually rich experience where these two modalities complement each other. As such, videos are a valuable type of media for the investigation of the interplay between audio and visual elements. Previous studies of audio-visual modalities primarily focused on either audio-visual representation learning or generative modeling of a modality conditioned on the other, creating a disconnect between these two branches. A unified framework that learns representation and generates modalities has not been developed yet. In this work, we introduce a novel framework called Vision to Audio and Beyond (VAB) to bridge the gap between audio-visual representation learning and vision-to-audio generation. The key approach of VAB is that rather than working with raw video frames and audio data, VAB performs representation learning and generative modeling within latent spaces. In particular, VAB uses a pre-trained audio tokenizer and an image encoder to obtain audio tokens and visual features, respectively. It then performs the pre-training task of visual-conditioned masked audio token prediction. This training strategy enables the model to engage in contextual learning and simultaneous video-to-audio generation. After the pre-training phase, VAB employs the iterative-decoding approach to rapidly generate audio tokens conditioned on visual features. Since VAB is a unified model, its backbone can be fine-tuned for various audio-visual downstream tasks. Our experiments showcase the efficiency of VAB in producing high-quality audio from video, and its capability to acquire semantic audio-visual features, leading to competitive results in audio-visual retrieval and classification.
@inproceedings{su2024vision,
  abbr        = {ICML},
  bibtex_show = {true},
  title       = {From Vision to Audio and Beyond: A Unified Model for Audio-Visual Representation and Generation},
  author      = {Su, Kun and Liu, Xiulong and Shlizerman, Eli},
  booktitle   = {Proceedings of the 41st International Conference on Machine Learning},
  editor      = {Salakhutdinov, Ruslan and Kolter, Zico and Heller, Katherine and Weller, Adrian and Oliver, Nuria and Scarlett, Jonathan and Berkenkamp, Felix},
  series      = {Proceedings of Machine Learning Research},
  volume      = {235},
  pages       = {46804--46822},
  publisher   = {PMLR},
  month       = {21--27 Jul},
  year        = {2024},
  selected    = {true},
}
CVPR
MuseChat: A Conversational Music Recommendation System for Videos
@inproceedings{Dong_2024_CVPR,
  abbr        = {CVPR},
  bibtex_show = {true},
  website     = {https://dongzhikang.github.io/musechat/},
  author      = {Dong, Zhikang and Liu, Xiulong and Chen, Bin and Polak, Pawel and Zhang, Peng},
  title       = {MuseChat: A Conversational Music Recommendation System for Videos},
  booktitle   = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  month       = jun,
  year        = {2024},
  pages       = {12775--12785},
  selected    = {true},
}
AAAI
CAVEN: An Embodied Conversational Agent for Efficient Audio-Visual Navigation in Noisy Environments
Liu, Xiulong,
Paul, Sudipta,
Chatterjee, Moitreya,
and Cherian, Anoop
Proceedings of the AAAI Conference on Artificial Intelligence
Mar
2024
@article{Liu_Paul_Chatterjee_Cherian_2024,
  abbr         = {AAAI},
  bibtex_show  = {true},
  title        = {CAVEN: An Embodied Conversational Agent for Efficient Audio-Visual Navigation in Noisy Environments},
  author       = {Liu, Xiulong and Paul, Sudipta and Chatterjee, Moitreya and Cherian, Anoop},
  journal      = {Proceedings of the AAAI Conference on Artificial Intelligence},
  volume       = {38},
  number       = {4},
  pages        = {3765--3773},
  month        = mar,
  year         = {2024},
  doi          = {10.1609/aaai.v38i4.28167},
  url          = {https://ojs.aaai.org/index.php/AAAI/article/view/28167},
  abstractnote = {Audio-visual navigation of an agent towards locating an audio goal is a challenging task especially when the audio is sporadic or the environment is noisy. In this paper, we present CAVEN, a Conversation-based Audio-Visual Embodied Navigation framework in which the agent may interact with a human/oracle for solving the task of navigating to an audio goal. Specifically, CAVEN is modeled as a budget-aware partially observable semi-Markov decision process that implicitly learns the uncertainty in the audio-based navigation policy to decide when and how the agent may interact with the oracle. Our CAVEN agent can engage in fully-bidirectional natural language conversations by producing relevant questions and interpret free-form, potentially noisy responses from the oracle based on the audio-visual context. To enable such a capability, CAVEN is equipped with: i) a trajectory forecasting network that is grounded in audio-visual cues to produce a potential trajectory to the estimated goal, and (ii) a natural language based question generation and reasoning network to pose an interactive question to the oracle or interpret the oracle’s response to produce navigation instructions. To train the interactive modules, we present a large scale dataset: AVN-Instruct, based on the Landmark-RxR dataset. To substantiate the usefulness of conversations, we present experiments on the benchmark audio-goal task using the SoundSpaces simulator under various noisy settings. Our results reveal that our fully-conversational approach leads to nearly an order-of-magnitude improvement in success rate, especially in localizing new sound sources and against methods that use only uni-directional interaction.},
  selected     = {true},
}
WACV
Let the Beat Follow You - Creating Interactive Drum Sounds From Body Rhythm
Liu, Xiulong,
Su, Kun,
and Shlizerman, Eli
In Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)
Jan
2024
@inproceedings{Liu_2024_WACV,
  abbr        = {WACV},
  bibtex_show = {true},
  author      = {Liu, Xiulong and Su, Kun and Shlizerman, Eli},
  title       = {Let the Beat Follow You - Creating Interactive Drum Sounds From Body Rhythm},
  booktitle   = {Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)},
  month       = jan,
  year        = {2024},
  pages       = {7187--7197},
  selected    = {true},
}
WACV
Tackling Data Bias in MUSIC-AVQA: Crafting a Balanced Dataset for Unbiased Question-Answering
Liu, Xiulong,
Dong, Zhikang,
and Zhang, Peng
In Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision
2024
@inproceedings{liu2024tackling,
  abbr        = {WACV},
  bibtex_show = {true},
  title       = {Tackling Data Bias in MUSIC-AVQA: Crafting a Balanced Dataset for Unbiased Question-Answering},
  author      = {Liu, Xiulong and Dong, Zhikang and Zhang, Peng},
  booktitle   = {Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision},
  pages       = {4478--4487},
  year        = {2024},
  selected    = {true},
}
2021
NEURIPS
How Does it Sound? Generation of Rhythmic Soundtracks for Human Movement Videos
Su, Kun*,
Liu, Xiulong*,
and Shlizerman, Eli
Advances in Neural Information Processing Systems
2021
@article{su2021howdoesitsound,
  abbr        = {NEURIPS},
  bibtex_show = {true},
  code        = {https://github.com/shlizee/RhythmicNet},
  title       = {How Does it Sound? Generation of Rhythmic Soundtracks for Human Movement Videos},
  author      = {Su, Kun* and Liu, Xiulong* and Shlizerman, Eli},
  journal     = {Advances in Neural Information Processing Systems},
  year        = {2021},
  selected    = {true},
}
2020
arXiv
Multi-instrumentalist net: Unsupervised generation of music from body movements
@article{su2020multi,
  abbr          = {Arxiv},
  bibtex_show   = {true},
  title         = {Multi-instrumentalist net: Unsupervised generation of music from body movements},
  author        = {Su, Kun and Liu, Xiulong and Shlizerman, Eli},
  journal       = {arXiv preprint arXiv:2012.03478},
  eprint        = {2012.03478},
  archiveprefix = {arXiv},
  year          = {2020},
}
NEURIPS
Audeo: Audio generation for a silent performance video
Su, Kun,
Liu, Xiulong,
and Shlizerman, Eli
Advances in Neural Information Processing Systems
2020
@article{su2020audeo,
  abbr        = {NEURIPS},
  bibtex_show = {true},
  code        = {https://github.com/shlizee/Audeo},
  title       = {Audeo: Audio generation for a silent performance video},
  author      = {Su, Kun and Liu, Xiulong and Shlizerman, Eli},
  journal     = {Advances in Neural Information Processing Systems},
  volume      = {33},
  year        = {2020},
  selected    = {true},
}
CVPR
Predict & cluster: Unsupervised skeleton based action recognition
Su, Kun,
Liu, Xiulong,
and Shlizerman, Eli
In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition
2020
@inproceedings{su2020predict,
  abbr        = {CVPR},
  bibtex_show = {true},
  code        = {https://github.com/shlizee/Predict-Cluster},
  title       = {Predict \& cluster: Unsupervised skeleton based action recognition},
  author      = {Su, Kun and Liu, Xiulong and Shlizerman, Eli},
  booktitle   = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages       = {9631--9640},
  year        = {2020},
  selected    = {true},
}