Aduc-sdr-cinematic-video

Runtime error

App Files Files Community

Carlexxx committed on Aug 27

Commit

af382fb

1 Parent(s): c0e5fc7

aduc-sdr

Browse files

Files changed (2) hide show

audio_specialist.py +25 -3
prompts/sound_director_prompt.txt.txt +27 -0

audio_specialist.py CHANGED Viewed

@@ -1,5 +1,6 @@
-# audio_specialist.py (Versão final para áudio dinâmico por fragmento)
 # Especialista ADUC para geração de áudio, com gerenciamento de memória GPU.
 import torch
 import logging
@@ -24,6 +25,10 @@ except ImportError:
 logger = logging.getLogger(__name__)
 class AudioSpecialist:
     def __init__(self, workspace_dir):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.cpu_device = torch.device("cpu")
@@ -38,6 +43,7 @@ class AudioSpecialist:
         self._load_models_to_cpu()
     def _load_models_to_cpu(self):
         try:
             logger.info("Verificando e baixando modelos MMAudio, se necessário...")
             self.model_config.download_if_needed()
@@ -66,12 +72,14 @@ class AudioSpecialist:
             self.net = None
     def to_gpu(self):
         if self.device == 'cpu': return
         logger.info(f"Movendo especialista de áudio para a GPU ({self.device})...")
         self.net.to(self.device, self.dtype)
         self.feature_utils.to(self.device, self.dtype)
     def to_cpu(self):
         if self.device == 'cpu': return
         logger.info("Descarregando especialista de áudio da GPU...")
         self.net.to(self.cpu_device)
@@ -79,7 +87,18 @@ class AudioSpecialist:
         gc.collect()
         if torch.cuda.is_available(): torch.cuda.empty_cache()
-    def generate_audio_for_video(self, video_path: str, prompt: str, negative_prompt: str, duration_seconds: float) -> str:
         if self.net is None:
             raise gr.Error("Modelo MMAudio não está carregado. Não é possível gerar áudio.")
@@ -89,6 +108,9 @@ class AudioSpecialist:
         logger.info(f"--- Duração: {duration_seconds:.2f}s")
         logger.info(f"--- Prompt (Descrição da Cena): '{prompt}'")
         if duration_seconds < 1:
             logger.warning("Fragmento muito curto (<1s). Retornando vídeo silencioso.")
             logger.info("------------------------------------------------------")
@@ -137,5 +159,5 @@ try:
     WORKSPACE_DIR = config['application']['workspace_dir']
     audio_specialist_singleton = AudioSpecialist(workspace_dir=WORKSPACE_DIR)
 except Exception as e:
-    logger.error(f"Não foi possível inicializar o AudioSpecialist: {e}")
     audio_specialist_singleton = None

+# audio_specialist.py
 # Especialista ADUC para geração de áudio, com gerenciamento de memória GPU.
+# Copyright (C) 4 de Agosto de 2025  Carlos Rodrigues dos Santos
 import torch
 import logging
 logger = logging.getLogger(__name__)
 class AudioSpecialist:
+    """
+    Especialista responsável por gerar áudio para fragmentos de vídeo.
+    Gerencia o carregamento e descarregamento de modelos de áudio da VRAM.
+    """
     def __init__(self, workspace_dir):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.cpu_device = torch.device("cpu")
         self._load_models_to_cpu()
     def _load_models_to_cpu(self):
+        """Carrega os modelos MMAudio para a memória da CPU na inicialização."""
         try:
             logger.info("Verificando e baixando modelos MMAudio, se necessário...")
             self.model_config.download_if_needed()
             self.net = None
     def to_gpu(self):
+        """Move os modelos e utilitários para a GPU antes da inferência."""
         if self.device == 'cpu': return
         logger.info(f"Movendo especialista de áudio para a GPU ({self.device})...")
         self.net.to(self.device, self.dtype)
         self.feature_utils.to(self.device, self.dtype)
     def to_cpu(self):
+        """Move os modelos de volta para a CPU e limpa a VRAM após a inferência."""
         if self.device == 'cpu': return
         logger.info("Descarregando especialista de áudio da GPU...")
         self.net.to(self.cpu_device)
         gc.collect()
         if torch.cuda.is_available(): torch.cuda.empty_cache()
+    def generate_audio_for_video(self, video_path: str, prompt: str, duration_seconds: float) -> str:
+        """
+        Gera áudio para um arquivo de vídeo, aplicando um prompt negativo para evitar fala.
+        Args:
+            video_path (str): Caminho para o vídeo silencioso.
+            prompt (str): Descrição da cena para guiar a geração de SFX.
+            duration_seconds (float): Duração do áudio a ser gerado.
+        Returns:
+            str: Caminho para o novo arquivo de vídeo com áudio.
+        """
         if self.net is None:
             raise gr.Error("Modelo MMAudio não está carregado. Não é possível gerar áudio.")
         logger.info(f"--- Duração: {duration_seconds:.2f}s")
         logger.info(f"--- Prompt (Descrição da Cena): '{prompt}'")
+        negative_prompt = "speech, human voice, talking, vocals, music, singing, dialogue"
+        logger.info(f"--- Negative Prompt: '{negative_prompt}'")
         if duration_seconds < 1:
             logger.warning("Fragmento muito curto (<1s). Retornando vídeo silencioso.")
             logger.info("------------------------------------------------------")
     WORKSPACE_DIR = config['application']['workspace_dir']
     audio_specialist_singleton = AudioSpecialist(workspace_dir=WORKSPACE_DIR)
 except Exception as e:
+    logger.error(f"Não foi possível inicializar o AudioSpecialist: {e}", exc_info=True)
     audio_specialist_singleton = None

prompts/sound_director_prompt.txt.txt ADDED Viewed

	@@ -0,0 +1,27 @@

+# ROLE: AI Sound Director & Foley Artist
+# GOAL:
+You are the sound director for a film. Your task is to create a single, rich, and descriptive prompt for an audio generation model (like MMAudio). This prompt must describe the complete soundscape for the CURRENT scene, considering what happened before and what will happen next to ensure audio continuity.
+# CRITICAL RULES (MUST FOLLOW):
+1.  **NO SPEECH OR VOICES:** The final prompt must NOT include any terms related to human speech, dialogue, talking, voices, singing, or narration. The goal is to create a world of ambient sounds and specific sound effects (SFX).
+2.  **FOCUS ON THE PRESENT:** The audio must primarily match the CURRENT visual scene (Keyframe Kn) and its textual description (Ato_n).
+3.  **USE THE PAST FOR CONTINUITY:** Analyze the "Previous Audio Prompt" to understand the established soundscape. If a sound should logically continue from the previous scene, include it (e.g., "the continued sound of a gentle breeze...").
+4.  **USE THE FUTURE FOR FORESHADOWING:** Analyze the FUTURE keyframe and scene description. If appropriate, introduce subtle sounds that hint at what's to come (e.g., if the next scene is a storm, you could add "...with the faint, distant rumble of thunder in the background").
+5.  **BE DESCRIPTIVE:** Use evocative language. Instead of "dog bark", use "the sharp, excited yapping of a small dog". Combine multiple elements into a cohesive soundscape.
+# CONTEXT FOR YOUR DECISION:
+- **Previous Audio Prompt (what was just heard):**
+{audio_history}
+- **VISUAL PAST (Keyframe Kn-1):** [PAST_IMAGE]
+- **VISUAL PRESENT (Keyframe Kn):** [PRESENT_IMAGE]
+- **VISUAL FUTURE (Keyframe Kn+1):** [FUTURE_IMAGE]
+- **CURRENT Scene Description (Ato_n):** "{present_scene_desc}"
+- **CURRENT Motion Prompt (what the camera is doing):** "{motion_prompt}"
+- **FUTURE Scene Description (Ato_n+1):** "{future_scene_desc}"
+# RESPONSE FORMAT:
+Respond with ONLY the final, single-line prompt string for the audio generator.