# aduc_framework/managers/llama_scout_manager.py
#
# Copyright (C) August 4, 2025 Carlos Rodrigues dos Santos
#
# Version 1.0.0 (Llama-4-Scout Multimodal Specialist)
#
# This manager implements a state-of-the-art multimodal specialist, capable of
# reasoning over images and sequences of video frames to provide complex
# analyses.
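#
# Typical usage elsewhere in the framework (a minimal sketch; assumes the
# module-level singleton at the bottom of this file initialized successfully
# and that the file paths exist):
#
#   from aduc_framework.managers.llama_scout_manager import llama_scout_manager_singleton
#
#   answer = llama_scout_manager_singleton.answer_on_image("frame.png", "What is shown?")
#   motion = llama_scout_manager_singleton.analyze_video_movement("clip.mp4")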
import gc
import logging

import imageio
import torch
import yaml
from PIL import Image
from transformers import AutoProcessor, AutoModelForImageTextToText

from ..tools.hardware_manager import hardware_manager

logger = logging.getLogger(__name__)


class LlamaScoutManager:
"""
Especialista Multimodal que utiliza o Llama-4-Scout para análise visual.
Mantém o modelo "quente" na GPU para performance máxima.
"""
MODEL_NAME = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
def __init__(self, device_id: str):
self.device = torch.device(device_id)
self.dtype = torch.bfloat16 if 'cuda' in self.device.type and torch.cuda.is_bf16_supported() else torch.float16
self.processor = None
self.model = None
self._initialize_and_warm_up_model()

    def _initialize_and_warm_up_model(self):
        """
        Downloads, loads, and moves the model to the GPU immediately at startup.
        """
        if self.model is not None:
            return
        try:
            logger.info(f"HOT INITIALIZATION: Loading MLLM '{self.MODEL_NAME}' onto GPU {self.device}...")
            self.processor = AutoProcessor.from_pretrained(self.MODEL_NAME)
            # An image-text-to-text head is needed so the pixel_values produced by
            # the processor are accepted; a plain causal-LM class would reject
            # image inputs.
            self.model = AutoModelForImageTextToText.from_pretrained(
                self.MODEL_NAME,
                torch_dtype=self.dtype,
                trust_remote_code=True  # Needed for recently released model code
            ).to(self.device)
            logger.info(f"MLLM '{self.MODEL_NAME}' is hot and ready on GPU {self.device}.")
        except Exception as e:
            logger.error(f"CRITICAL failure while loading the MLLM: {e}", exc_info=True)
            self.model = None

    def _cleanup_gpu_cache(self):
        """Frees the VRAM cache after an inference pass."""
        if self.device.type == 'cuda':
            gc.collect()
            torch.cuda.empty_cache()

    def answer_on_image(self, image_path: str, question: str) -> str:
        """Answers a question about a single image."""
        if self.model is None:
            return "Error: Multimodal model not initialized. Check logs."
        try:
            image = Image.open(image_path).convert("RGB")
            prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n"
            inputs = self.processor(text=prompt, images=image, return_tensors="pt").to(self.device, self.dtype)
            with torch.no_grad():
                generated_ids = self.model.generate(**inputs, max_new_tokens=200, do_sample=False)
            # Decode only the newly generated tokens; splitting the full decode on
            # "<|assistant|>" breaks when skip_special_tokens strips that marker.
            new_tokens = generated_ids[:, inputs["input_ids"].shape[1]:]
            clean_response = self.processor.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()
            logger.info(f"Question: '{question}' | Image: '{image_path}' -> Answer: '{clean_response}'")
            return clean_response
        except Exception as e:
            logger.error(f"Error processing MLLM for image {image_path}: {e}", exc_info=True)
            return f"Error analyzing the image: {e}"
        finally:
            self._cleanup_gpu_cache()

    def analyze_video_movement(self, video_path: str) -> str:
        """
        Analyzes a video by extracting frames and asking the MLLM to describe the movement.
        """
        if self.model is None:
            return "Error: Multimodal model not initialized. Check logs."
        logger.info(f"Starting MOVEMENT ANALYSIS for video '{video_path}'...")
        try:
            with imageio.get_reader(video_path, 'ffmpeg') as reader:
                meta_data = reader.get_meta_data()
                # 'nframes' can be float('inf') for some containers; fall back to
                # an explicit count in that case.
                total_frames = meta_data.get('nframes')
                if not isinstance(total_frames, int):
                    total_frames = reader.count_frames()
                if total_frames < 8:
                    return "Video is too short for movement analysis."
                indices_to_sample = list(range(0, total_frames, 8))
                frames = [Image.fromarray(reader.get_data(i)) for i in indices_to_sample]
            # Build the multimodal prompt with one image placeholder per sampled frame
            prompt_text = (
                "<|user|>\n"
                + "".join([f"<|image_{i+1}|>\n" for i in range(len(frames))])
                + "You are a film analyst. The images above are sequential frames from a video, sampled at regular intervals. "
                "Describe the movement, action, and narrative that unfolds across this sequence. Focus on what happens between the frames.<|end|>\n"
                "<|assistant|>\n"
            )
            inputs = self.processor(text=prompt_text, images=frames, return_tensors="pt").to(self.device, self.dtype)
            with torch.no_grad():
                generated_ids = self.model.generate(**inputs, max_new_tokens=400, do_sample=False)
            # Decode only the newly generated tokens (see answer_on_image).
            new_tokens = generated_ids[:, inputs["input_ids"].shape[1]:]
            analysis = self.processor.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()
            logger.info(f"Video '{video_path}' analysis complete. Result: '{analysis[:100]}...'")
            return analysis
        except Exception as e:
            logger.error(f"Error analyzing video movement for {video_path}: {e}", exc_info=True)
            return f"Error during video movement analysis: {e}"
        finally:
            self._cleanup_gpu_cache()

# --- Singleton Instantiation ---
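# The bootstrap below expects a config.yaml shaped roughly like this
# (a minimal sketch showing only the keys read here):
#
#   specialists:
#     llama_scout:
#       gpus_required: 1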
try:
    with open("config.yaml", 'r') as f:
        config = yaml.safe_load(f)
    # The config section is named after the new model
    gpus_required = config['specialists'].get('llama_scout', {}).get('gpus_required', 1)
    device_ids = hardware_manager.allocate_gpus('LlamaScout', gpus_required)
    llama_scout_manager_singleton = LlamaScoutManager(device_id=device_ids[0])
except Exception as e:
    logger.critical(f"Could not initialize LlamaScoutManager: {e}. Using a placeholder.", exc_info=True)

    class LlamaScoutPlaceholder:
        def answer_on_image(self, *args, **kwargs):
            return "Error: LlamaScout Specialist not initialized."

        def analyze_video_movement(self, *args, **kwargs):
            return "Error: LlamaScout Specialist not initialized."

    llama_scout_manager_singleton = LlamaScoutPlaceholder()
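

if __name__ == "__main__":
    # Minimal smoke test (a sketch): the file paths below are hypothetical
    # placeholders. Run as a module so the relative imports resolve, e.g.:
    #   python -m aduc_framework.managers.llama_scout_manager
    # If initialization failed, the singleton is the error-returning
    # placeholder defined above.
    print(llama_scout_manager_singleton.answer_on_image(
        "sample_frame.png", "Describe this image in one sentence."
    ))
    print(llama_scout_manager_singleton.analyze_video_movement("sample_clip.mp4"))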