import gradio as gr
import torch
from PIL import Image
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
)
from typing import Union
from gtts import gTTS
import os
import uuid
import time
import gc
# Limit CPU threads so inference does not starve the Space's shared cores.
torch.set_num_threads(2)

_pipeline = None


def init_pipeline():
    """Lazily create a single shared ImageCaptionPipeline instance."""
    global _pipeline
    if _pipeline is None:
        _pipeline = ImageCaptionPipeline()
    return _pipeline
class ImageCaptionPipeline:
    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # BLIP generates the English caption.
        start_time = time.time()
        self.blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large", use_fast=True)
        self.blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(self.device)
        print(f"BLIP load time: {time.time() - start_time:.2f} s")

        # Helsinki-NLP opus-mt-en-ru translates the caption into Russian.
        start_time = time.time()
        self.translator_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ru")
        self.translator_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ru").to(self.device)
        print(f"Translator load time: {time.time() - start_time:.2f} s")
    def generate_captions(self, image: Union[str, Image.Image]) -> tuple:
        start_time = time.time()
        if isinstance(image, str):
            image = Image.open(image)
        image = image.convert("RGB")
        # Fixed 512x512 resize bounds inference time (aspect ratio is not preserved).
        image = image.resize((512, 512), Image.Resampling.LANCZOS)

        inputs = self.blip_processor(images=image, return_tensors="pt").to(self.device)
        with torch.no_grad():
            output_ids = self.blip_model.generate(**inputs, max_length=50, num_beams=2, early_stopping=True)
        english_caption = self.blip_processor.decode(output_ids[0], skip_special_tokens=True)
        print(f"English caption time: {time.time() - start_time:.2f} s")

        start_time = time.time()
        translated_inputs = self.translator_tokenizer(english_caption, return_tensors="pt", padding=True).to(self.device)
        with torch.no_grad():
            translated_ids = self.translator_model.generate(
                **translated_inputs,
                max_length=50,
                num_beams=2,
                early_stopping=True,
            )
        russian_caption = self.translator_tokenizer.decode(translated_ids[0], skip_special_tokens=True)
        print(f"Russian translation time: {time.time() - start_time:.2f} s")

        # Free intermediate objects; helpful on a memory-constrained Space.
        gc.collect()
        return english_caption, russian_caption
    def generate_audio(self, text: str, language: str) -> str:
        start_time = time.time()
        lang_code = "ru" if language == "Russian" else "en"
        tts = gTTS(text=text, lang=lang_code)
        # Unique filename so concurrent requests do not overwrite each other.
        audio_path = f"caption_audio_{uuid.uuid4()}.mp3"
        tts.save(audio_path)
        print(f"Audio generation time: {time.time() - start_time:.2f} s")
        return audio_path
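
# Standalone usage sketch (not part of the Gradio app; "example.jpg" is a
# hypothetical local file used only for illustration):
#
#     pipeline = init_pipeline()
#     en, ru = pipeline.generate_captions("example.jpg")
#     audio_path = pipeline.generate_audio(ru, "Russian")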
def generate_captions(image: Image.Image) -> tuple:
    # Returns (english, russian, None); the trailing None clears any stale audio clip.
    if image is not None:
        pipeline = init_pipeline()
        english_caption, russian_caption = pipeline.generate_captions(image)
        return english_caption, russian_caption, None
    return "Please upload an image.", "Please upload an image.", None
def generate_audio(english_caption: str, russian_caption: str, audio_language: str) -> str:
    if not english_caption and not russian_caption:
        return None
    pipeline = init_pipeline()
    text = russian_caption if audio_language == "Russian" else english_caption
    return pipeline.generate_audio(text, audio_language)
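
# gTTS writes a new mp3 per request and nothing removes them, so clips
# accumulate in the working directory. A minimal cleanup sketch that could be
# called at the start of generate_audio (the helper and its 20-file cap are
# assumptions, not part of the original app):
def cleanup_old_audio(max_files: int = 20) -> None:
    """Keep only the `max_files` most recent generated mp3 clips."""
    clips = sorted(
        (f for f in os.listdir(".") if f.startswith("caption_audio_") and f.endswith(".mp3")),
        key=os.path.getmtime,
    )
    for stale in clips[:-max_files]:
        try:
            os.remove(stale)
        except OSError:
            pass  # Another request may have removed it already.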
with gr.Blocks(css="""
    .btn {
        width: 200px;
        background-color: #4B0082;
        color: white;
        font-size: 16px;
    }
    .equal-height {
        height: 100px !important;
    }
""") as iface:
    with gr.Row():
        with gr.Column(scale=1, min_width=400, variant="panel"):
            with gr.Row():
                image = gr.Image(type="pil", label="Image", height=400, width=400)
            with gr.Row():
                submit_button = gr.Button("Generate caption", elem_classes="btn")
        with gr.Column(scale=1, variant="panel"):
            with gr.Row():
                english_caption = gr.Textbox(label="English:", lines=1, interactive=False)
                russian_caption = gr.Textbox(label="Russian:", lines=1, interactive=False)
            with gr.Row():
                audio_language = gr.Dropdown(
                    choices=["Russian", "English"],
                    label="Audio language",
                    value="Russian",
                    elem_classes="equal-height",
                )
                audio_output = gr.Audio(
                    label="Audio",
                    elem_classes="equal-height",
                )
            with gr.Row():
                audio_button = gr.Button("Generate audio", elem_classes="btn")
    submit_button.click(
        fn=generate_captions,
        inputs=[image],
        # generate_captions returns three values, so audio_output must be
        # listed here; it receives None, clearing any stale clip.
        outputs=[english_caption, russian_caption, audio_output],
    )
    audio_button.click(
        fn=generate_audio,
        inputs=[english_caption, russian_caption, audio_language],
        outputs=[audio_output],
    )
if __name__ == "__main__":
    iface.launch()

# Pum-pummm..
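
# If several users hit the Space at once, Gradio's request queue can
# serialize the long-running inference calls (a suggestion, not part of the
# original code):
#
#     iface.queue().launch()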