euiia committed on
Commit
7866a75
·
verified ·
1 Parent(s): 8434eb9

Update managers/audio_specialist.py

Browse files
Files changed (1) hide show
  1. managers/audio_specialist.py +104 -55
managers/audio_specialist.py CHANGED
@@ -1,6 +1,13 @@
1
  # audio_specialist.py
2
- # Especialista ADUC para geração de áudio, com gerenciamento de memória GPU.
3
- # Copyright (C) 4 de Agosto de 2025 Carlos Rodrigues dos Santos
 
 
 
 
 
 
 
4
 
5
  import torch
6
  import logging
@@ -11,23 +18,20 @@ import yaml
11
  import gc
12
  from pathlib import Path
13
  import gradio as gr
14
-
15
- # Importa as classes e funções necessárias do MMAudio
16
- try:
17
- from mmaudio.eval_utils import ModelConfig, all_model_cfg, generate as mmaudio_generate, load_video, make_video
18
- from mmaudio.model.flow_matching import FlowMatching
19
- from mmaudio.model.networks import MMAudio, get_my_mmaudio
20
- from mmaudio.model.utils.features_utils import FeaturesUtils
21
- from mmaudio.model.sequence_config import SequenceConfig
22
- except ImportError:
23
- raise ImportError("MMAudio não foi encontrado. Por favor, instale-o a partir do GitHub: git+https://github.com/hkchengrex/MMAudio.git")
24
 
25
  logger = logging.getLogger(__name__)
26
 
 
 
 
 
 
27
  class AudioSpecialist:
28
  """
29
- Especialista responsável por gerar áudio para fragmentos de vídeo.
30
- Gerencia o carregamento e descarregamento de modelos de áudio da VRAM.
 
31
  """
32
  def __init__(self, workspace_dir):
33
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -35,26 +39,84 @@ class AudioSpecialist:
35
  self.dtype = torch.bfloat16 if self.device == "cuda" else torch.float32
36
  self.workspace_dir = workspace_dir
37
 
38
- self.model_config: ModelConfig = all_model_cfg['large_44k_v2']
39
- self.net: MMAudio = None
40
- self.feature_utils: FeaturesUtils = None
41
- self.seq_cfg: SequenceConfig = None
 
 
 
 
42
 
43
  self._load_models_to_cpu()
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  def _load_models_to_cpu(self):
46
- """Carrega os modelos MMAudio para a memória da CPU na inicialização."""
47
  try:
48
- logger.info("Verificando e baixando modelos MMAudio, se necessário...")
 
49
  self.model_config.download_if_needed()
50
 
51
  self.seq_cfg = self.model_config.seq_cfg
52
 
53
- logger.info(f"Carregando modelo MMAudio: {self.model_config.model_name} para a CPU...")
54
  self.net = get_my_mmaudio(self.model_config.model_name).eval()
55
  self.net.load_weights(torch.load(self.model_config.model_path, map_location=self.cpu_device, weights_only=True))
56
 
57
- logger.info("Carregando utilitários de features do MMAudio para a CPU...")
58
  self.feature_utils = FeaturesUtils(
59
  tod_vae_ckpt=self.model_config.vae_path,
60
  synchformer_ckpt=self.model_config.synchformer_ckpt,
@@ -66,22 +128,22 @@ class AudioSpecialist:
66
  self.feature_utils = self.feature_utils.eval()
67
  self.net.to(self.cpu_device)
68
  self.feature_utils.to(self.cpu_device)
69
- logger.info("Especialista de áudio pronto na CPU.")
70
  except Exception as e:
71
- logger.error(f"Falha ao carregar modelos de áudio: {e}", exc_info=True)
72
  self.net = None
73
 
74
  def to_gpu(self):
75
- """Move os modelos e utilitários para a GPU antes da inferência."""
76
  if self.device == 'cpu': return
77
- logger.info(f"Movendo especialista de áudio para a GPU ({self.device})...")
78
  self.net.to(self.device, self.dtype)
79
  self.feature_utils.to(self.device, self.dtype)
80
 
81
  def to_cpu(self):
82
- """Move os modelos de volta para a CPU e limpa a VRAM após a inferência."""
83
  if self.device == 'cpu': return
84
- logger.info("Descarregando especialista de áudio da GPU...")
85
  self.net.to(self.cpu_device)
86
  self.feature_utils.to(self.cpu_device)
87
  gc.collect()
@@ -89,35 +151,24 @@ class AudioSpecialist:
89
 
90
  def generate_audio_for_video(self, video_path: str, prompt: str, duration_seconds: float, output_path_override: str = None) -> str:
91
  """
92
- Gera áudio para um arquivo de vídeo, aplicando um prompt negativo para evitar fala.
93
-
94
- Args:
95
- video_path (str): Caminho para o vídeo silencioso.
96
- prompt (str): Descrição da cena para guiar a geração de SFX.
97
- duration_seconds (float): Duração do áudio a ser gerado.
98
-
99
- Returns:
100
- str: Caminho para o novo arquivo de vídeo com áudio.
101
  """
102
  if self.net is None:
103
- raise gr.Error("Modelo MMAudio não está carregado. Não é possível gerar áudio.")
104
 
105
- logger.info("------------------------------------------------------")
106
- logger.info("--- Gerando Áudio para Fragmento de Vídeo ---")
107
- logger.info(f"--- Vídeo Fragmento: {os.path.basename(video_path)}")
108
- logger.info(f"--- Duração: {duration_seconds:.2f}s")
109
- logger.info(f"--- Prompt (Descrição da Cena): '{prompt}'")
110
 
111
- negative_prompt = "human voice"
112
- logger.info(f"--- Negative Prompt: '{negative_prompt}'")
113
 
114
  if duration_seconds < 1:
115
- logger.warning("Fragmento muito curto (<1s). Retornando vídeo silencioso.")
116
- logger.info("------------------------------------------------------")
117
  return video_path
118
-
119
  if self.device == 'cpu':
120
- logger.warning("Gerando áudio na CPU. Isso pode ser muito lento.")
121
 
122
  try:
123
  self.to_gpu()
@@ -142,12 +193,10 @@ class AudioSpecialist:
142
  )
143
  audio_waveform = audios.float().cpu()[0]
144
 
145
- fragment_name = Path(video_path).stem
146
- output_video_path = output_path_override if output_path_override else os.path.join(self.workspace_dir, f"{fragment_name}_com_audio.mp4")
147
-
148
  make_video(video_info, Path(output_video_path), audio_waveform, sampling_rate=self.seq_cfg.sampling_rate)
149
- logger.info(f"--- Fragmento com áudio salvo em: {os.path.basename(output_video_path)}")
150
- logger.info("------------------------------------------------------")
151
  return output_video_path
152
  finally:
153
  self.to_cpu()
@@ -159,5 +208,5 @@ try:
159
  WORKSPACE_DIR = config['application']['workspace_dir']
160
  audio_specialist_singleton = AudioSpecialist(workspace_dir=WORKSPACE_DIR)
161
  except Exception as e:
162
- logger.error(f"Não foi possível inicializar o AudioSpecialist: {e}", exc_info=True)
163
  audio_specialist_singleton = None
 
1
  # audio_specialist.py
2
+ #
3
+ # Copyright (C) August 4, 2025 Carlos Rodrigues dos Santos
4
+ #
5
+ # Version: 2.2.0
6
+ #
7
+ # This file defines the Audio Specialist for the ADUC-SDR framework. It is responsible
8
+ # for generating audio synchronized with video clips. This version has been refactored
9
+ # to be self-contained by automatically cloning the MMAudio dependency from its
10
+ # official repository, making the framework more portable and easier to set up.
11
 
12
  import torch
13
  import logging
 
18
  import gc
19
  from pathlib import Path
20
  import gradio as gr
21
+ import sys
 
 
 
 
 
 
 
 
 
22
 
23
  logger = logging.getLogger(__name__)
24
 
25
# --- Dependency Management ---
# Directory holding source checkouts of external dependencies. The path is
# relative, so it resolves against the process's current working directory.
DEPS_DIR = Path("./deps")
# Location of the MMAudio checkout inside DEPS_DIR; AudioSpecialist adds this
# directory to sys.path so that the `mmaudio` package can be imported.
MMAUDIO_REPO_DIR = DEPS_DIR / "MMAudio"
# Upstream Git repository that is cloned into MMAUDIO_REPO_DIR when it is missing.
MMAUDIO_REPO_URL = "https://github.com/hkchengrex/MMAudio.git"
29
+
30
  class AudioSpecialist:
31
  """
32
+ Specialist responsible for generating audio for video fragments.
33
+ Manages the loading and unloading of audio models from VRAM and handles
34
+ its own code dependencies by cloning the MMAudio repository.
35
  """
36
  def __init__(self, workspace_dir):
37
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
 
39
  self.dtype = torch.bfloat16 if self.device == "cuda" else torch.float32
40
  self.workspace_dir = workspace_dir
41
 
42
+ self._mmaudio_modules_loaded = False
43
+ self._setup_dependencies()
44
+ self._lazy_load_mmaudio_modules()
45
+
46
+ self.model_config: 'ModelConfig' = self.all_model_cfg['large_44k_v2']
47
+ self.net: 'MMAudio' = None
48
+ self.feature_utils: 'FeaturesUtils' = None
49
+ self.seq_cfg: 'SequenceConfig' = None
50
 
51
  self._load_models_to_cpu()
52
 
53
+ def _setup_dependencies(self):
54
+ """
55
+ Checks for the MMAudio repository locally. If not found, clones it.
56
+ Then, it adds the repository to the Python path to make its modules importable.
57
+ """
58
+ if not MMAUDIO_REPO_DIR.exists():
59
+ logger.info(f"MMAudio repository not found at '{MMAUDIO_REPO_DIR}'. Cloning from GitHub...")
60
+ try:
61
+ DEPS_DIR.mkdir(exist_ok=True)
62
+ subprocess.run(
63
+ ["git", "clone", MMAUDIO_REPO_URL, str(MMAUDIO_REPO_DIR)],
64
+ check=True, capture_output=True, text=True
65
+ )
66
+ logger.info("MMAudio repository cloned successfully.")
67
+ except subprocess.CalledProcessError as e:
68
+ logger.error(f"Failed to clone MMAudio repository. Git stderr: {e.stderr}")
69
+ raise RuntimeError("Could not clone the required MMAudio dependency from GitHub.")
70
+ else:
71
+ logger.info("Found local MMAudio repository.")
72
+
73
+ if str(MMAUDIO_REPO_DIR.resolve()) not in sys.path:
74
+ sys.path.insert(0, str(MMAUDIO_REPO_DIR.resolve()))
75
+ logger.info(f"Added '{MMAUDIO_REPO_DIR.resolve()}' to sys.path.")
76
+
77
    def _lazy_load_mmaudio_modules(self):
        """
        Import the MMAudio symbols used by this module and bind them as module globals.

        The imports are performed inside this method rather than at the top of
        the file; ``__init__`` calls it after ``_setup_dependencies``. On
        success the ``all_model_cfg`` mapping is also stored on the instance
        and ``self._mmaudio_modules_loaded`` is set, so further calls on the
        same instance return immediately.

        Any exception raised by the imports is not caught here and propagates
        to the caller.
        """
        if self._mmaudio_modules_loaded:
            return

        # Declaring the names global makes the import statements below bind
        # module-level names, so the other methods can use them unqualified.
        global ModelConfig, all_model_cfg, mmaudio_generate, load_video, make_video
        global FlowMatching, MMAudio, get_my_mmaudio, FeaturesUtils, SequenceConfig

        from mmaudio.eval_utils import ModelConfig, all_model_cfg, generate as mmaudio_generate, load_video, make_video
        from mmaudio.model.flow_matching import FlowMatching
        from mmaudio.model.networks import MMAudio, get_my_mmaudio
        from mmaudio.model.utils.features_utils import FeaturesUtils
        from mmaudio.model.sequence_config import SequenceConfig

        # Keep a reference on the instance; this is the same object as the
        # module-level all_model_cfg, not a copy.
        self.all_model_cfg = all_model_cfg
        self._mmaudio_modules_loaded = True
        logger.info("MMAudio modules have been dynamically loaded.")
95
+
96
+ def _adjust_paths_for_repo(self):
97
+ """Adjusts the checkpoint paths in the model config to point inside the cloned repo."""
98
+ for cfg_key in self.all_model_cfg:
99
+ cfg = self.all_model_cfg[cfg_key]
100
+ cfg.model_path = MMAUDIO_REPO_DIR / cfg.model_path
101
+ cfg.vae_path = MMAUDIO_REPO_DIR / cfg.vae_path
102
+ if cfg.bigvgan_16k_path is not None:
103
+ cfg.bigvgan_16k_path = MMAUDIO_REPO_DIR / cfg.bigvgan_16k_path
104
+ cfg.synchformer_ckpt = MMAUDIO_REPO_DIR / cfg.synchformer_ckpt
105
+
106
  def _load_models_to_cpu(self):
107
+ """Loads the MMAudio models to CPU memory on initialization."""
108
  try:
109
+ self._adjust_paths_for_repo()
110
+ logger.info("Verifying and downloading MMAudio models, if necessary...")
111
  self.model_config.download_if_needed()
112
 
113
  self.seq_cfg = self.model_config.seq_cfg
114
 
115
+ logger.info(f"Loading MMAudio model: {self.model_config.model_name} to CPU...")
116
  self.net = get_my_mmaudio(self.model_config.model_name).eval()
117
  self.net.load_weights(torch.load(self.model_config.model_path, map_location=self.cpu_device, weights_only=True))
118
 
119
+ logger.info("Loading MMAudio feature utils to CPU...")
120
  self.feature_utils = FeaturesUtils(
121
  tod_vae_ckpt=self.model_config.vae_path,
122
  synchformer_ckpt=self.model_config.synchformer_ckpt,
 
128
  self.feature_utils = self.feature_utils.eval()
129
  self.net.to(self.cpu_device)
130
  self.feature_utils.to(self.cpu_device)
131
+ logger.info("Audio Specialist ready on CPU.")
132
  except Exception as e:
133
+ logger.error(f"Failed to load audio models: {e}", exc_info=True)
134
  self.net = None
135
 
136
  def to_gpu(self):
137
+ """Moves the models and utilities to the GPU before inference."""
138
  if self.device == 'cpu': return
139
+ logger.info(f"Moving Audio Specialist to GPU ({self.device})...")
140
  self.net.to(self.device, self.dtype)
141
  self.feature_utils.to(self.device, self.dtype)
142
 
143
  def to_cpu(self):
144
+ """Moves the models back to CPU and clears VRAM after inference."""
145
  if self.device == 'cpu': return
146
+ logger.info("Unloading Audio Specialist from GPU...")
147
  self.net.to(self.cpu_device)
148
  self.feature_utils.to(self.cpu_device)
149
  gc.collect()
 
151
 
152
  def generate_audio_for_video(self, video_path: str, prompt: str, duration_seconds: float, output_path_override: str = None) -> str:
153
  """
154
+ Generates audio for a video file, applying a negative prompt to avoid speech.
 
 
 
 
 
 
 
 
155
  """
156
  if self.net is None:
157
+ raise gr.Error("MMAudio model is not loaded. Cannot generate audio.")
158
 
159
+ logger.info("--- Generating Audio for Video Fragment ---")
160
+ logger.info(f"--- Video: {os.path.basename(video_path)}")
161
+ logger.info(f"--- Duration: {duration_seconds:.2f}s")
 
 
162
 
163
+ negative_prompt = "human voice, speech, talking, singing, narration"
164
+ logger.info(f"--- Prompt: '{prompt}' | Negative Prompt: '{negative_prompt}'")
165
 
166
  if duration_seconds < 1:
167
+ logger.warning("Fragment too short (<1s). Returning original video.")
 
168
  return video_path
169
+
170
  if self.device == 'cpu':
171
+ logger.warning("Generating audio on CPU. This may be very slow.")
172
 
173
  try:
174
  self.to_gpu()
 
193
  )
194
  audio_waveform = audios.float().cpu()[0]
195
 
196
+ output_video_path = output_path_override if output_path_override else os.path.join(self.workspace_dir, f"{Path(video_path).stem}_with_audio.mp4")
197
+
 
198
  make_video(video_info, Path(output_video_path), audio_waveform, sampling_rate=self.seq_cfg.sampling_rate)
199
+ logger.info(f"--- Fragment with audio saved to: {os.path.basename(output_video_path)}")
 
200
  return output_video_path
201
  finally:
202
  self.to_cpu()
 
208
  WORKSPACE_DIR = config['application']['workspace_dir']
209
  audio_specialist_singleton = AudioSpecialist(workspace_dir=WORKSPACE_DIR)
210
  except Exception as e:
211
+ logger.error(f"Could not initialize AudioSpecialist: {e}", exc_info=True)
212
  audio_specialist_singleton = None