Aduc-sdr-cinematic-video

Runtime error

App Files Files Community

Aduc-sdr committed on Sep 4

Commit

9a0d6a9

verified ·

1 Parent(s): 5f2000a

Update managers/seedvr_manager.py

Browse files

Files changed (1) hide show

managers/seedvr_manager.py +18 -38

managers/seedvr_manager.py CHANGED Viewed

@@ -31,7 +31,6 @@ import gradio as gr
 import mediapy
 from einops import rearrange
-# Internalized utility for color correction, ensuring stability.
 from tools.tensor_utils import wavelet_reconstruction
 logger = logging.getLogger(__name__)
@@ -40,17 +39,16 @@ logger = logging.getLogger(__name__)
 DEPS_DIR = Path("./deps")
 SEEDVR_REPO_DIR = DEPS_DIR / "SeedVR"
 SEEDVR_REPO_URL = "https://github.com/ByteDance-Seed/SeedVR.git"
 def setup_seedvr_dependencies():
     """
     Ensures the SeedVR repository is cloned and available in the sys.path.
-    This function is run once when the module is first imported.
     """
     if not SEEDVR_REPO_DIR.exists():
         logger.info(f"SeedVR repository not found at '{SEEDVR_REPO_DIR}'. Cloning from GitHub...")
         try:
             DEPS_DIR.mkdir(exist_ok=True)
-            # Use --depth 1 for a shallow clone to save space and time
             subprocess.run(
                 ["git", "clone", "--depth", "1", SEEDVR_REPO_URL, str(SEEDVR_REPO_DIR)],
                 check=True, capture_output=True, text=True
@@ -62,15 +60,12 @@ def setup_seedvr_dependencies():
     else:
         logger.info("Found local SeedVR repository.")
-    # Add the cloned repo to Python's path to allow direct imports
     if str(SEEDVR_REPO_DIR.resolve()) not in sys.path:
         sys.path.insert(0, str(SEEDVR_REPO_DIR.resolve()))
         logger.info(f"Added '{SEEDVR_REPO_DIR.resolve()}' to sys.path.")
-# --- Execute dependency setup immediately upon module import ---
 setup_seedvr_dependencies()
-# --- Now that the path is set, we can safely import from the cloned repo ---
 from projects.video_diffusion_sr.infer import VideoDiffusionInfer
 from common.config import load_config
 from common.seed import set_seed
@@ -83,7 +78,6 @@ from omegaconf import OmegaConf
 def _load_file_from_url(url, model_dir='./', file_name=None):
-    """Helper function to download files from a URL to a local directory."""
     os.makedirs(model_dir, exist_ok=True)
     filename = file_name or os.path.basename(urlparse(url).path)
     cached_file = os.path.abspath(os.path.join(model_dir, filename))
@@ -103,14 +97,18 @@ class SeedVrManager:
         self.is_initialized = False
         logger.info("SeedVrManager initialized. Model will be loaded on demand.")
-    def _download_models(self):
-        """Downloads the necessary checkpoints for SeedVR2."""
-        logger.info("Verifying and downloading SeedVR2 models...")
         ckpt_dir = SEEDVR_REPO_DIR / 'ckpts'
         ckpt_dir.mkdir(exist_ok=True)
         pretrain_model_urls = {
-            'vae': 'https://huggingface.co/ByteDance-Seed/SeedVR2-3B/resolve/main/ema_vae.pth',
             'dit_3b': 'https://huggingface.co/ByteDance-Seed/SeedVR2-3B/resolve/main/seedvr2_ema_3b.pth',
             'dit_7b': 'https://huggingface.co/ByteDance-Seed/SeedVR2-7B/resolve/main/seedvr2_ema_7b.pth',
             'pos_emb': 'https://huggingface.co/ByteDance-Seed/SeedVR2-3B/resolve/main/pos_emb.pt',
@@ -120,14 +118,12 @@ class SeedVrManager:
         for key, url in pretrain_model_urls.items():
             _load_file_from_url(url=url, model_dir=str(ckpt_dir))
-        logger.info("SeedVR2 models downloaded successfully.")
     def _initialize_runner(self, model_version: str):
         """Loads and configures the SeedVR model on demand based on the selected version."""
         if self.runner is not None: return
-        self._download_models()
         logger.info(f"Initializing SeedVR2 {model_version} runner...")
         if model_version == '3B':
             config_path = SEEDVR_REPO_DIR / 'configs_3b' / 'main.yaml'
@@ -139,11 +135,15 @@ class SeedVrManager:
             raise ValueError(f"Unsupported SeedVR model version: {model_version}")
         config = load_config(str(config_path))
         self.runner = VideoDiffusionInfer(config)
         OmegaConf.set_readonly(self.runner.config, False)
         self.runner.configure_dit_model(device=self.device, checkpoint=str(checkpoint_path))
         self.runner.configure_vae_model()
         if hasattr(self.runner.vae, "set_memory_limit"):
@@ -153,7 +153,6 @@ class SeedVrManager:
         logger.info(f"Runner for SeedVR2 {model_version} initialized and ready.")
     def _unload_runner(self):
-        """Removes the runner from VRAM to free resources."""
         if self.runner is not None:
             del self.runner; self.runner = None
             gc.collect(); torch.cuda.empty_cache()
@@ -163,17 +162,13 @@ class SeedVrManager:
     def process_video(self, input_video_path: str, output_video_path: str, prompt: str,
                       model_version: str = '3B', steps: int = 50, seed: int = 666,
                       progress: gr.Progress = None) -> str:
-        """Applies HD enhancement to a video using the SeedVR logic."""
         try:
             self._initialize_runner(model_version)
             set_seed(seed, same_across_ranks=True)
             self.runner.config.diffusion.timesteps.sampling.steps = steps
             self.runner.configure_diffusion()
             video_tensor = read_video(input_video_path, output_format="TCHW")[0] / 255.0
             res_h, res_w = video_tensor.shape[-2:]
             video_transform = Compose([
                 NaResize(resolution=(res_h * res_w) ** 0.5, mode="area", downsample_only=False),
                 Lambda(lambda x: torch.clamp(x, 0.0, 1.0)),
@@ -181,48 +176,33 @@ class SeedVrManager:
                 Normalize(0.5, 0.5),
                 Rearrange("t c h w -> c t h w"),
             ])
             cond_latents = [video_transform(video_tensor.to(self.device))]
             input_videos = cond_latents
             self.runner.dit.to("cpu")
             self.runner.vae.to(self.device)
             cond_latents = self.runner.vae_encode(cond_latents)
             self.runner.vae.to("cpu"); gc.collect(); torch.cuda.empty_cache()
             self.runner.dit.to(self.device)
             pos_emb_path = SEEDVR_REPO_DIR / 'ckpts' / 'pos_emb.pt'
             neg_emb_path = SEEDVR_REPO_DIR / 'ckpts' / 'neg_emb.pt'
             text_pos_embeds = torch.load(pos_emb_path).to(self.device)
             text_neg_embeds = torch.load(neg_emb_path).to(self.device)
             text_embeds_dict = {"texts_pos": [text_pos_embeds], "texts_neg": [text_neg_embeds]}
             noises = [torch.randn_like(latent) for latent in cond_latents]
             conditions = [self.runner.get_condition(noise, latent_blur=latent, task="sr") for noise, latent in zip(noises, cond_latents)]
             with torch.no_grad(), torch.autocast("cuda", torch.bfloat16, enabled=True):
                 video_tensors = self.runner.inference(noises=noises, conditions=conditions, dit_offload=True, **text_embeds_dict)
             self.runner.dit.to("cpu"); gc.collect(); torch.cuda.empty_cache()
             self.runner.vae.to(self.device)
             samples = self.runner.vae_decode(video_tensors)
             final_sample = samples[0]
             input_video_sample = input_videos[0]
             if final_sample.shape[1] < input_video_sample.shape[1]:
                 input_video_sample = input_video_sample[:, :final_sample.shape[1]]
-            final_sample = wavelet_reconstruction(
-                rearrange(final_sample, "c t h w -> t c h w"),
-                rearrange(input_video_sample, "c t h w -> t c h w")
-            )
             final_sample = rearrange(final_sample, "t c h w -> t h w c")
             final_sample = final_sample.clip(-1, 1).mul_(0.5).add_(0.5).mul_(255).round()
             final_sample_np = final_sample.to(torch.uint8).cpu().numpy()
             mediapy.write_video(output_video_path, final_sample_np, fps=24)
             logger.info(f"HD Mastered video saved to: {output_video_path}")
             return output_video_path

 import mediapy
 from einops import rearrange
 from tools.tensor_utils import wavelet_reconstruction
 logger = logging.getLogger(__name__)
 DEPS_DIR = Path("./deps")
 SEEDVR_REPO_DIR = DEPS_DIR / "SeedVR"
 SEEDVR_REPO_URL = "https://github.com/ByteDance-Seed/SeedVR.git"
+VAE_CONFIG_URL = "https://raw.githubusercontent.com/ByteDance-Seed/SeedVR/main/models/video_vae_v3/s8_c16_t4_inflation_sd3.yaml"
 def setup_seedvr_dependencies():
     """
     Ensures the SeedVR repository is cloned and available in the sys.path.
     """
     if not SEEDVR_REPO_DIR.exists():
         logger.info(f"SeedVR repository not found at '{SEEDVR_REPO_DIR}'. Cloning from GitHub...")
         try:
             DEPS_DIR.mkdir(exist_ok=True)
             subprocess.run(
                 ["git", "clone", "--depth", "1", SEEDVR_REPO_URL, str(SEEDVR_REPO_DIR)],
                 check=True, capture_output=True, text=True
     else:
         logger.info("Found local SeedVR repository.")
     if str(SEEDVR_REPO_DIR.resolve()) not in sys.path:
         sys.path.insert(0, str(SEEDVR_REPO_DIR.resolve()))
         logger.info(f"Added '{SEEDVR_REPO_DIR.resolve()}' to sys.path.")
 setup_seedvr_dependencies()
 from projects.video_diffusion_sr.infer import VideoDiffusionInfer
 from common.config import load_config
 from common.seed import set_seed
 def _load_file_from_url(url, model_dir='./', file_name=None):
     os.makedirs(model_dir, exist_ok=True)
     filename = file_name or os.path.basename(urlparse(url).path)
     cached_file = os.path.abspath(os.path.join(model_dir, filename))
         self.is_initialized = False
         logger.info("SeedVrManager initialized. Model will be loaded on demand.")
+    def _download_models_and_configs(self):
+        """Downloads the necessary checkpoints AND the missing VAE config file."""
+        logger.info("Verifying and downloading SeedVR2 models and configs...")
         ckpt_dir = SEEDVR_REPO_DIR / 'ckpts'
+        config_dir = SEEDVR_REPO_DIR / 'configs' / 'vae'
         ckpt_dir.mkdir(exist_ok=True)
+        config_dir.mkdir(parents=True, exist_ok=True)
+        _load_file_from_url(url=VAE_CONFIG_URL, model_dir=str(config_dir))
         pretrain_model_urls = {
+            'vae_ckpt': 'https://huggingface.co/ByteDance-Seed/SeedVR2-3B/resolve/main/ema_vae.pth',
             'dit_3b': 'https://huggingface.co/ByteDance-Seed/SeedVR2-3B/resolve/main/seedvr2_ema_3b.pth',
             'dit_7b': 'https://huggingface.co/ByteDance-Seed/SeedVR2-7B/resolve/main/seedvr2_ema_7b.pth',
             'pos_emb': 'https://huggingface.co/ByteDance-Seed/SeedVR2-3B/resolve/main/pos_emb.pt',
         for key, url in pretrain_model_urls.items():
             _load_file_from_url(url=url, model_dir=str(ckpt_dir))
+        logger.info("SeedVR2 models and configs downloaded successfully.")
     def _initialize_runner(self, model_version: str):
         """Loads and configures the SeedVR model on demand based on the selected version."""
         if self.runner is not None: return
+        self._download_models_and_configs()
         logger.info(f"Initializing SeedVR2 {model_version} runner...")
         if model_version == '3B':
             config_path = SEEDVR_REPO_DIR / 'configs_3b' / 'main.yaml'
             raise ValueError(f"Unsupported SeedVR model version: {model_version}")
         config = load_config(str(config_path))
         self.runner = VideoDiffusionInfer(config)
         OmegaConf.set_readonly(self.runner.config, False)
         self.runner.configure_dit_model(device=self.device, checkpoint=str(checkpoint_path))
+        # --- PATH CORRECTION ---
+        correct_vae_config_path = SEEDVR_REPO_DIR / 'configs' / 'vae' / 's8_c16_t4_inflation_sd3.yaml'
+        logger.info(f"Correcting VAE config path to: {correct_vae_config_path}")
+        self.runner.config.vae.config = str(correct_vae_config_path)
         self.runner.configure_vae_model()
         if hasattr(self.runner.vae, "set_memory_limit"):
         logger.info(f"Runner for SeedVR2 {model_version} initialized and ready.")
     def _unload_runner(self):
         if self.runner is not None:
             del self.runner; self.runner = None
             gc.collect(); torch.cuda.empty_cache()
     def process_video(self, input_video_path: str, output_video_path: str, prompt: str,
                       model_version: str = '3B', steps: int = 50, seed: int = 666,
                       progress: gr.Progress = None) -> str:
         try:
             self._initialize_runner(model_version)
             set_seed(seed, same_across_ranks=True)
             self.runner.config.diffusion.timesteps.sampling.steps = steps
             self.runner.configure_diffusion()
             video_tensor = read_video(input_video_path, output_format="TCHW")[0] / 255.0
             res_h, res_w = video_tensor.shape[-2:]
             video_transform = Compose([
                 NaResize(resolution=(res_h * res_w) ** 0.5, mode="area", downsample_only=False),
                 Lambda(lambda x: torch.clamp(x, 0.0, 1.0)),
                 Normalize(0.5, 0.5),
                 Rearrange("t c h w -> c t h w"),
             ])
             cond_latents = [video_transform(video_tensor.to(self.device))]
             input_videos = cond_latents
             self.runner.dit.to("cpu")
             self.runner.vae.to(self.device)
             cond_latents = self.runner.vae_encode(cond_latents)
             self.runner.vae.to("cpu"); gc.collect(); torch.cuda.empty_cache()
             self.runner.dit.to(self.device)
             pos_emb_path = SEEDVR_REPO_DIR / 'ckpts' / 'pos_emb.pt'
             neg_emb_path = SEEDVR_REPO_DIR / 'ckpts' / 'neg_emb.pt'
             text_pos_embeds = torch.load(pos_emb_path).to(self.device)
             text_neg_embeds = torch.load(neg_emb_path).to(self.device)
             text_embeds_dict = {"texts_pos": [text_pos_embeds], "texts_neg": [text_neg_embeds]}
             noises = [torch.randn_like(latent) for latent in cond_latents]
             conditions = [self.runner.get_condition(noise, latent_blur=latent, task="sr") for noise, latent in zip(noises, cond_latents)]
             with torch.no_grad(), torch.autocast("cuda", torch.bfloat16, enabled=True):
                 video_tensors = self.runner.inference(noises=noises, conditions=conditions, dit_offload=True, **text_embeds_dict)
             self.runner.dit.to("cpu"); gc.collect(); torch.cuda.empty_cache()
             self.runner.vae.to(self.device)
             samples = self.runner.vae_decode(video_tensors)
             final_sample = samples[0]
             input_video_sample = input_videos[0]
             if final_sample.shape[1] < input_video_sample.shape[1]:
                 input_video_sample = input_video_sample[:, :final_sample.shape[1]]
+            final_sample = wavelet_reconstruction(rearrange(final_sample, "c t h w -> t c h w"), rearrange(input_video_sample, "c t h w -> t c h w"))
             final_sample = rearrange(final_sample, "t c h w -> t h w c")
             final_sample = final_sample.clip(-1, 1).mul_(0.5).add_(0.5).mul_(255).round()
             final_sample_np = final_sample.to(torch.uint8).cpu().numpy()
             mediapy.write_video(output_video_path, final_sample_np, fps=24)
             logger.info(f"HD Mastered video saved to: {output_video_path}")
             return output_video_path