|
|
|
|
|
""" |
|
|
Test multilingual summarization functionality.

Tests summarization and title generation across different languages.
|
|
""" |
|
|
|
|
|
import sys |
|
|
import os |
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
|
|
|
|
from src.summarization import summarize_transcript, generate_title |
|
|
from src.utils import available_gguf_llms |
|
|
|
|
|
|
|
|
# Sample transcripts keyed by language name. Every entry is a short passage
# about artificial intelligence and its impact on society, written in the
# language named by its key. Both test functions iterate over this mapping.
TEST_TRANSCRIPTS = {
    "english": """
    Hello everyone, today we're going to discuss artificial intelligence and its impact on modern society.
    AI has become increasingly important in our daily lives, from voice assistants like Siri and Alexa,
    to recommendation systems on Netflix and YouTube. The technology is advancing rapidly, with machine
    learning algorithms becoming more sophisticated every day. However, we must also consider the ethical
    implications of AI development, including privacy concerns, job displacement, and the potential for bias
    in automated decision-making systems. It's crucial that we develop AI responsibly to ensure it benefits
    all of humanity rather than just a select few.
    """,

    "french": """
    Bonjour à tous, aujourd'hui nous allons discuter de l'intelligence artificielle et de son impact sur la société moderne.
    L'IA est devenue de plus en plus importante dans notre vie quotidienne, des assistants vocaux comme Siri et Alexa,
    aux systèmes de recommandation sur Netflix et YouTube. La technologie progresse rapidement, avec des algorithmes
    d'apprentissage automatique devenant plus sophistiqués chaque jour. Cependant, nous devons également considérer
    les implications éthiques du développement de l'IA, y compris les préoccupations de confidentialité, le déplacement
    d'emplois, et le potentiel de biais dans les systèmes de prise de décision automatisée. Il est crucial que nous
    développions l'IA de manière responsable pour assurer qu'elle bénéficie à toute l'humanité plutôt qu'à une élite.
    """,

    "spanish": """
    Hola a todos, hoy vamos a discutir sobre la inteligencia artificial y su impacto en la sociedad moderna.
    La IA se ha vuelto cada vez más importante en nuestra vida diaria, desde asistentes de voz como Siri y Alexa,
    hasta sistemas de recomendación en Netflix y YouTube. La tecnología está avanzando rápidamente, con algoritmos
    de aprendizaje automático volviéndose más sofisticados cada día. Sin embargo, también debemos considerar
    las implicaciones éticas del desarrollo de la IA, incluyendo preocupaciones de privacidad, desplazamiento
    laboral, y el potencial de sesgo en sistemas de toma de decisiones automatizada. Es crucial que desarrollemos
    la IA de manera responsable para asegurar que beneficie a toda la humanidad en lugar de solo a unos pocos.
    """,

    "german": """
    Hallo zusammen, heute werden wir über künstliche Intelligenz und ihre Auswirkungen auf die moderne Gesellschaft sprechen.
    KI ist in unserem täglichen Leben immer wichtiger geworden, von Sprachassistenten wie Siri und Alexa,
    bis hin zu Empfehlungssystemen auf Netflix und YouTube. Die Technologie entwickelt sich rasant weiter, mit
    maschinellen Lernalgorithmen, die jeden Tag ausgefeilter werden. Allerdings müssen wir auch die ethischen
    Implikationen der KI-Entwicklung berücksichtigen, einschließlich Datenschutzbedenken, Arbeitsplatzverlust
    und das Potenzial für Voreingenommenheit in automatisierten Entscheidungssystemen. Es ist entscheidend, dass
    wir KI verantwortungsvoll entwickeln, um sicherzustellen, dass sie der gesamten Menschheit zugutekommt und nicht nur wenigen.
    """,

    "chinese": """
    大家好,今天我们来讨论人工智能及其对现代社会的影响。
    人工智能已经在我们的日常生活中变得越来越重要,从Siri和Alexa这样的语音助手,
    到Netflix和YouTube上的推荐系统。技术正在快速发展,机器学习算法每天都变得更加复杂。
    然而,我们也必须考虑人工智能发展的伦理含义,包括隐私问题、就业流失,
    以及自动化决策系统中潜在的偏见。至关重要的是,我们要负责任地开发人工智能,
    以确保它惠及全人类而不是少数人。
    """,

    "japanese": """
    皆さんこんにちは、今日は人工知能とそれが現代社会に与える影響について議論します。
    AIは私たちの日常生活でますます重要になっています。SiriやAlexaのような音声アシスタントから、
    NetflixやYouTubeの推薦システムまで。技術は急速に進化しており、機械学習アルゴリズムは
    日々より洗練されたものになっています。しかし、私たちはAI開発の倫理的影響も考慮しなければなりません。
    プライバシーの懸念、雇用の喪失、自動化された意思決定システムにおける偏りの可能性を含めて。
    私たちはAIを責任を持って開発し、それが少数の人々ではなく全人類に利益をもたらすことを確実にすることが重要です。
    """,

    "arabic": """
    مرحباً بكم جميعاً، اليوم سنناقش الذكاء الاصطناعي وتأثيره على المجتمع الحديث.
    أصبح الذكاء الاصطناعي أكثر أهمية في حياتنا اليومية، من المساعدين الصوتيين مثل Siri وAlexa،
    إلى أنظمة التوصية على Netflix وYouTube. التكنولوجيا تتطور بسرعة، مع خوارزميات التعلم الآلي
    التي تصبح أكثر تعقيداً كل يوم. ومع ذلك، يجب أن نعتبر أيضاً الآثار الأخلاقية لتطوير الذكاء الاصطناعي،
    بما في ذلك مخاوف الخصوصية، والتهجير الوظيفي، وإمكانية التحيز في أنظمة اتخاذ القرارات الآلية.
    من المهم أن نطور الذكاء الاصطناعي بمسؤولية لضمان أنه يفيد البشرية جمعاء وليس فئة قليلة فقط.
    """,
}
|
|
|
|
|
# Inclusive Unicode code-point ranges used to recognise each non-Latin script.
_SUMMARY_SCRIPT_RANGES = {
    "chinese": (("\u4e00", "\u9fff"),),  # CJK Unified Ideographs
    "japanese": (
        ("\u3040", "\u309f"),  # Hiragana
        ("\u30a0", "\u30ff"),  # Katakana
        ("\u4e00", "\u9fff"),  # CJK Unified Ideographs (kanji)
    ),
    "arabic": (("\u0600", "\u06ff"),),  # Arabic
}


def _summary_language_ratio(language, summary):
    """Measure how much of *summary* appears to be written in *language*.

    Args:
        language: Language key such as ``"english"`` or ``"chinese"``.
        summary: The generated summary text.

    Returns:
        A ``(label, ratio, warn_below)`` tuple, or ``None`` when no check is
        defined for *language* or *summary* has nothing to measure.
        English is measured per word (ASCII words longer than two
        characters); the other scripts are measured per character.
    """
    if language == "english":
        words = summary.split()
        if not words:
            return None
        matching = sum(1 for word in words if word.isascii() and len(word) > 2)
        return "English", matching / len(words), 0.3

    ranges = _SUMMARY_SCRIPT_RANGES.get(language)
    if ranges is None or not summary:
        return None
    matching = sum(
        1 for char in summary if any(low <= char <= high for low, high in ranges)
    )
    return f"{language.capitalize()} character", matching / len(summary), 0.1


def test_multilingual_summarization():
    """Generate and print a title and summary for every sample transcript.

    For English, Chinese, Japanese and Arabic the summary is additionally
    checked for the share of text in the expected language; the ratio is
    printed and a warning is emitted when it falls below the threshold.
    A failure for one language is reported and does not stop the others.
    """
    print("Testing multilingual summarization...")

    # Use the first key of the available-models mapping.
    model_name = next(iter(available_gguf_llms))
    print(f"Using model: {model_name}")

    for language, transcript in TEST_TRANSCRIPTS.items():
        print(f"\n--- Testing {language.upper()} ---")
        print(f"Original transcript length: {len(transcript)} characters")

        try:
            title = generate_title(transcript, model_name)
            print(f"Generated title: {title}")

            # summarize_transcript is consumed as an iterable of text chunks.
            summary = "".join(
                summarize_transcript(transcript, model_name, "Summarize this transcript")
            )
            preview = f"{summary[:200]}..." if len(summary) > 200 else summary
            print(f"Generated summary: {preview}")

            check = _summary_language_ratio(language, summary)
            if check is not None:
                label, ratio, warn_below = check
                # Previously this printed the literal text ".2f".
                print(f"{label} ratio: {ratio:.2f}")
                if ratio < warn_below:
                    print(f"⚠️ WARNING: Low {label} ratio detected ({ratio:.2f})")

            print("✅ Test passed")

        except Exception as e:
            # Best-effort report: log the failure and move on to the next language.
            print(f"❌ Test failed for {language}: {e}")
|
|
|
|
|
# Inclusive Unicode code-point ranges per language. Languages that are not
# listed here are measured by their share of ASCII characters instead.
_CONSISTENCY_SCRIPT_RANGES = {
    "chinese": (("\u4e00", "\u9fff"),),  # CJK Unified Ideographs
    "japanese": (
        ("\u3040", "\u309f"),  # Hiragana
        ("\u30a0", "\u30ff"),  # Katakana
        ("\u4e00", "\u9fff"),  # CJK Unified Ideographs (kanji)
    ),
    "arabic": (("\u0600", "\u06ff"),),  # Arabic
}


def _language_char_ratio(text, language):
    """Return the fraction of characters in *text* that match *language*.

    Args:
        text: The string to inspect.
        language: Language key; Chinese, Japanese and Arabic use their
            Unicode ranges, every other language counts ASCII characters.

    Returns:
        A value between 0 and 1, or 0 when *text* is empty.
    """
    if not text:
        return 0
    ranges = _CONSISTENCY_SCRIPT_RANGES.get(language)
    if ranges is None:
        matching = sum(1 for char in text if char.isascii())
    else:
        matching = sum(
            1 for char in text if any(low <= char <= high for low, high in ranges)
        )
    return matching / len(text)


def test_language_consistency():
    """Check that a transcript's title and summary use the same language.

    For every sample transcript a title and a summary are generated, the
    share of language-specific characters in each is computed, and a warning
    is printed when the two shares differ by more than 0.3. A failure for one
    language is reported and does not stop the others.
    """
    print("\n\nTesting language consistency between titles and summaries...")

    # Use the first key of the available-models mapping.
    model_name = next(iter(available_gguf_llms))

    for language, transcript in TEST_TRANSCRIPTS.items():
        print(f"\n--- Testing consistency for {language.upper()} ---")

        try:
            title = generate_title(transcript, model_name)
            # summarize_transcript is consumed as an iterable of text chunks.
            summary = "".join(
                summarize_transcript(transcript, model_name, "Summarize this transcript")
            )

            if not title or not summary:
                print("❌ Empty title or summary generated")
                continue

            title_ratio = _language_char_ratio(title, language)
            summary_ratio = _language_char_ratio(summary, language)
            ratio_diff = abs(title_ratio - summary_ratio)

            print(f"Language character ratios - Title: {title_ratio:.2f}, Summary: {summary_ratio:.2f}")
            if ratio_diff > 0.3:
                print(f"⚠️ WARNING: Large difference in language character ratios ({ratio_diff:.2f})")
            else:
                print("✅ Language consistency maintained")

        except Exception as e:
            # Best-effort report: log the failure and move on to the next language.
            print(f"❌ Consistency test failed for {language}: {e}")
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run both checks in order between two banner lines.
    banner = "=" * 60

    print("Starting multilingual summarization tests...")
    print(banner)

    for run_check in (test_multilingual_summarization, test_language_consistency):
        run_check()

    print("\n" + banner)
    print("Multilingual tests completed!")