% Workshop paper: Lang2Mol-Diff, in the proceedings of the 1st Workshop on
% Language + Molecules (Lang + Mol 2024, co-located with ACL 2024).
% - Names are stored as "Last, First" so the family name is unambiguous.
% - Braced words in the title keep their capitalisation under sentence-casing styles.
% - copyright / eventtitle / eventdate hold metadata that should not be printed
%   as a free-form note; classic BibTeX styles ignore these fields.
@inproceedings{ab7b8f6d589b4ad4864a0745b25702eb,
  author     = {Nguyen, Nguyen Doan Hieu and Pham, Nhat Truong and Tran, Duong Thanh and Manavalan, Balachandran},
  title      = {{Lang2Mol-Diff}: A Diffusion-Based Generative Model for Language-to-Molecule Translation Leveraging {SELFIES} Molecular String Representation},
  booktitle  = {Lang + Mol 2024 - 1st Workshop on Language + Molecules, Proceedings of the Workshop},
  editor     = {Edwards, Carl and Wang, Qingyun and Li, Manling and Zhao, Lawrence and Hope, Tom and Ji, Heng},
  publisher  = {Association for Computational Linguistics (ACL)},
  year       = {2024},
  month      = aug,
  pages      = {129--135},
  language   = {English},
  eventtitle = {1st Workshop on Language + Molecules, Lang + Mol 2024, co-located with ACL 2024},
  eventdate  = {2024-08-15},
  copyright  = {{\textcopyright} 2024 Association for Computational Linguistics},
  abstract   = {Generating de novo molecules from textual descriptions is challenging due to potential issues with molecule validity in SMILES representation and limitations of autoregressive models. This work introduces Lang2Mol-Diff, a diffusion-based language-to-molecule generative model using the SELFIES representation. Specifically, Lang2Mol-Diff leverages the strengths of two state-of-the-art molecular generative models: BioT5 and TGM-DLM. By employing BioT5 to tokenize the SELFIES representation, Lang2Mol-Diff addresses the validity issues associated with SMILES strings. Additionally, it incorporates a text diffusion mechanism from TGM-DLM to overcome the limitations of autoregressive models in this domain. To the best of our knowledge, this is the first study to leverage the diffusion mechanism for text-based de novo molecule generation using the SELFIES molecular string representation. Performance evaluation on the L+M-24 benchmark dataset shows that Lang2Mol-Diff outperforms all existing methods for molecule generation in terms of validity. Our code and pre-processed data are available at https://github.com/nhattruongpham/mollang-bridge/tree/lang2mol/.},
}