from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess
import torch
import emoji

# Pick the first GPU if available, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Load the SenseVoiceSmall ASR model once at module level so it is shared
# across all Audio2Text instances.
asr_model_path = r"/work/models/SenseVoiceSmall/"
asr_model = AutoModel(
    model=asr_model_path,
    trust_remote_code=False,
    device=device,
    disable_update=True,
)


class Audio2Text:
    def __init__(self, audio_file_path):
        self.audio_path = audio_file_path

    def audio_to_text(self):
        res = asr_model.generate(
            input=self.audio_path,
            cache={},
            language="auto",  # or "zh", "en", "yue", "ja", "ko", "nospeech"
            use_itn=True,     # apply inverse text normalization (numbers, punctuation)
            # batch_size_s=180,
            merge_vad=True,
            # merge_length_s=15,
            ncpu=4,
        )
        # Convert SenseVoice's rich output (language/emotion/event tags) into plain text.
        text = rich_transcription_postprocess(res[0]["text"])
        # Strip any emoji the postprocessor emits for emotion/event markers.
        text = ''.join(char for char in text if char not in emoji.EMOJI_DATA)
        print(f"audio2text result: {text}")
        return text
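A minimal usage sketch, assuming an audio file exists locally (the path below is a placeholder, not from the original code):

transcriber = Audio2Text("/path/to/sample.wav")  # hypothetical audio file path
text = transcriber.audio_to_text()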