""" Moduł do ekstrakcji zaawansowanych cech lingwistycznych przy użyciu spaCy. """ import re import math from collections import Counter from statistics import mean, variance from typing import Dict, List import textstat import spacy from ..utils import safe_divide from ..constants import CAMEL_CASE_PATTERN # --- Funkcje analizujące podstawowe statystyki z Doc --- def analyze_pos_stats(doc: spacy.tokens.Doc) -> Dict[str, float]: """Oblicza statystyki części mowy (POS), interpunkcji i stopwords.""" words = [t for t in doc if not t.is_punct and not t.is_space and t.pos_ != 'SYM'] words_count = len(words) if not words_count: return {'words': 0, 'nouns': 0, 'verbs': 0, 'adjectives': 0, 'adverbs': 0, 'punctuations': 0, 'symbols': 0, 'stopwords': 0, 'oovs': 0, 'pos_x': 0, 'pos_num': 0, 'noun_ratio': 0.0, 'verb_ratio': 0.0, 'adj_ratio': 0.0} stats = { 'words': words_count, 'nouns': sum(1 for t in doc if t.pos_ == "NOUN"), 'verbs': sum(1 for t in doc if t.pos_ == "VERB"), 'adjectives': sum(1 for t in doc if t.pos_ == "ADJ"), 'adverbs': sum(1 for t in doc if t.pos_ == "ADV"), 'punctuations': sum(1 for t in doc if t.is_punct), 'symbols': sum(1 for t in doc if t.pos_ == "SYM"), 'stopwords': sum(1 for t in doc if t.is_stop), 'oovs': sum(1 for t in doc if t.is_oov), 'pos_x': sum(1 for t in doc if t.pos_ == "X"), 'pos_num': sum(1 for t in doc if t.pos_ == "NUM"), } stats['noun_ratio'] = safe_divide(stats['nouns'], words_count) stats['verb_ratio'] = safe_divide(stats['verbs'], words_count) stats['adj_ratio'] = safe_divide(stats['adjectives'], words_count) return stats def analyze_doc_level_stats(doc: spacy.tokens.Doc, text: str) -> Dict[str, float]: """Analizuje cechy na poziomie całego dokumentu.""" words = [t for t in doc if not t.is_punct and not t.is_space and t.pos_ != 'SYM'] words_count = len(words) sentences_count = len(list(doc.sents)) return { 'sentences': sentences_count, 'avg_word_length': safe_divide(sum(len(t.text) for t in words), words_count), 'avg_sentence_length': safe_divide(words_count, sentences_count), 'lexical_density': safe_divide(len({t.lemma_ for t in words}), words_count), 'gunning_fog': textstat.gunning_fog(text) if text.strip() else 0.0, 'camel_case': sum(1 for t in words if CAMEL_CASE_PATTERN.match(t.text)), 'capitalized_words': sum(1 for t in words if t.text.isupper()), } # --- Funkcje analizujące zaawansowane cechy lingwistyczne --- def analyze_named_entities(doc: spacy.tokens.Doc) -> Dict[str, float]: """Analizuje rozpoznane jednostki nazwane (NER).""" alpha_words = [t for t in doc if t.is_alpha] if not alpha_words: return {"ner_count": 0, "ner_person_ratio": 0.0, "ner_org_ratio": 0.0, "ner_loc_ratio": 0.0, "ner_misc_ratio": 0.0} ents = doc.ents return { "ner_count": len(ents), "ner_person_ratio": safe_divide(sum(1 for e in ents if e.label_ == "persName"), len(alpha_words)), "ner_org_ratio": safe_divide(sum(1 for e in ents if e.label_ == "orgName"), len(alpha_words)), "ner_loc_ratio": safe_divide(sum(1 for e in ents if e.label_ in ["placeName", "locName"]), len(alpha_words)), "ner_misc_ratio": safe_divide(sum(1 for e in ents if e.label_ not in ["persName", "orgName", "placeName", "locName"]), len(alpha_words)), } def analyze_morphology(doc: spacy.tokens.Doc) -> Dict[str, float]: """Analizuje różnorodność morfologiczną.""" alpha_tokens = [t for t in doc if t.is_alpha] if not alpha_tokens: return {"case_diversity": 0.0, "tense_diversity": 0.0, "mood_diversity": 0.0} cases, tenses, moods = [], [], [] for token in alpha_tokens: if token.morph: 

def analyze_morphology(doc: spacy.tokens.Doc) -> Dict[str, float]:
    """Analyzes morphological diversity."""
    alpha_tokens = [t for t in doc if t.is_alpha]
    if not alpha_tokens:
        return {"case_diversity": 0.0, "tense_diversity": 0.0, "mood_diversity": 0.0}
    cases, tenses, moods = [], [], []
    for token in alpha_tokens:
        if token.morph:
            cases.extend(token.morph.get("Case", []))
            tenses.extend(token.morph.get("Tense", []))
            moods.extend(token.morph.get("Mood", []))
    return {
        "case_diversity": safe_divide(len(set(cases)), len(alpha_tokens)),
        "tense_diversity": safe_divide(len(set(tenses)), len(alpha_tokens)),
        "mood_diversity": safe_divide(len(set(moods)), len(alpha_tokens)),
    }


def analyze_dependency_complexity(doc: spacy.tokens.Doc) -> Dict[str, float]:
    """Computes the average depth of the dependency tree."""
    depths = []
    for sent in doc.sents:
        if not list(sent):
            continue
        max_depth = 0
        for token in sent:
            dist = 0
            curr = token
            # Walk up towards the root; the hard cap guards against malformed trees.
            while curr.head != curr and dist < 100:
                curr = curr.head
                dist += 1
            max_depth = max(max_depth, dist)
        depths.append(max_depth)
    return {"avg_dependency_tree_depth": mean(depths) if depths else 0.0}


def analyze_pos_frequencies(doc: spacy.tokens.Doc, top_k: int = 10) -> Dict[str, float]:
    """Analyzes the POS distribution of the most frequent words."""
    tokens = [t for t in doc if t.is_alpha]
    if not tokens:
        return {"top_words_total_count": 0, "top_words_noun_ratio": 0.0,
                "top_words_verb_ratio": 0.0, "top_words_adj_ratio": 0.0,
                "top_words_other_ratio": 0.0,
                "top_words_noun_prop_of_all_nouns": 0.0,
                "top_words_verb_prop_of_all_verbs": 0.0,
                "top_words_adj_prop_of_all_adjs": 0.0,
                "top_words_other_prop_of_all_others": 0.0}
    word_counts = Counter(t.text.lower() for t in tokens)
    top_words = {w for w, _ in word_counts.most_common(top_k)}
    top_tokens = [t for t in tokens if t.text.lower() in top_words]
    total_top_count = len(top_tokens)
    top_noun = sum(1 for t in top_tokens if t.pos_ == 'NOUN')
    top_verb = sum(1 for t in top_tokens if t.pos_ == 'VERB')
    top_adj = sum(1 for t in top_tokens if t.pos_ == 'ADJ')
    top_other = total_top_count - (top_noun + top_verb + top_adj)
    total_nouns = sum(1 for t in tokens if t.pos_ == "NOUN")
    total_verbs = sum(1 for t in tokens if t.pos_ == "VERB")
    total_adjs = sum(1 for t in tokens if t.pos_ == "ADJ")
    total_others = len(tokens) - (total_nouns + total_verbs + total_adjs)
    return {
        "top_words_total_count": total_top_count,
        "top_words_noun_ratio": safe_divide(top_noun, total_top_count),
        "top_words_verb_ratio": safe_divide(top_verb, total_top_count),
        "top_words_adj_ratio": safe_divide(top_adj, total_top_count),
        "top_words_other_ratio": safe_divide(top_other, total_top_count),
        "top_words_noun_prop_of_all_nouns": safe_divide(top_noun, total_nouns),
        "top_words_verb_prop_of_all_verbs": safe_divide(top_verb, total_verbs),
        "top_words_adj_prop_of_all_adjs": safe_divide(top_adj, total_adjs),
        "top_words_other_prop_of_all_others": safe_divide(top_other, total_others),
    }


# --- Functions requiring only raw text ---

def compute_readability_indices(text: str, sentences: List[str]) -> Dict[str, float]:
    """Computes the LIX and RIX readability indices."""
    if not text.strip():
        return {"lix": 0.0, "rix": 0.0}
    words = re.findall(r'\w+', text)
    num_words = len(words)
    num_sentences = len(sentences)
    long_words = sum(1 for w in words if len(w) > 6)
    # LIX = words per sentence + percentage of long words (> 6 characters).
    lix = safe_divide(num_words, num_sentences) + safe_divide(long_words * 100, num_words)
    # RIX (Anderson, 1983) = long words per sentence.
    rix = safe_divide(long_words, num_sentences)
    return {"lix": lix, "rix": rix}
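
# Worked example for the two indices above: a text with 20 words, 2 sentences
# and 5 long words gives LIX = 20/2 + 5*100/20 = 10 + 25 = 35.0
# and RIX = 5/2 = 2.5.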

def analyze_polish_diacritics_distribution(text: str) -> Dict[str, float]:
    """Analyzes the distribution of Polish diacritic characters."""
    polish_diacritics = 'ąćęłńóśźżĄĆĘŁŃÓŚŹŻ'
    total = len(text)
    if total == 0:
        return {"diacritics_std_dev": 0.0}
    counts = Counter(text)
    diac_counts = [counts[ch] for ch in polish_diacritics if ch in counts]
    if not diac_counts:
        return {"diacritics_std_dev": 0.0}
    diac_freqs = [c / total for c in diac_counts]
    mean_freq = mean(diac_freqs)
    # Population standard deviation of the per-character frequencies.
    variance_val = sum((x - mean_freq) ** 2 for x in diac_freqs) / len(diac_freqs)
    return {"diacritics_std_dev": math.sqrt(variance_val)}


def analyze_question_sentences(sentences: List[str]) -> Dict[str, float]:
    """Computes the ratio of interrogative sentences to all sentences."""
    if not sentences:
        return {"question_sentence_ratio": 0.0}
    questions = sum(1 for s in sentences if s.strip().endswith('?'))
    return {"question_sentence_ratio": safe_divide(questions, len(sentences))}


# --- Main aggregating function ---

def calculate_all_spacy_features(doc: spacy.tokens.Doc, text: str,
                                 sentences: List[str]) -> Dict[str, float]:
    """Aggregates all advanced linguistic features."""
    features = {}
    features.update(analyze_pos_stats(doc))
    features.update(analyze_doc_level_stats(doc, text))
    features.update(analyze_named_entities(doc))
    features.update(analyze_morphology(doc))
    features.update(analyze_dependency_complexity(doc))
    features.update(analyze_pos_frequencies(doc))
    features.update(compute_readability_indices(text, sentences))
    features.update(analyze_polish_diacritics_distribution(text))
    features.update(analyze_question_sentences(sentences))
    return features
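
# A minimal smoke test, assuming a Polish spaCy model is installed;
# "pl_core_news_sm" is an assumed model name, and because of the relative
# imports above this must be run as a module from the package root
# (python -m <package>.<this_module>), not as a plain script.
if __name__ == "__main__":
    nlp = spacy.load("pl_core_news_sm")  # assumed Polish pipeline
    sample_text = "Ala ma kota. Czy Jan mieszka w Warszawie?"
    sample_doc = nlp(sample_text)
    sample_sentences = [s.text for s in sample_doc.sents]
    all_features = calculate_all_spacy_features(sample_doc, sample_text, sample_sentences)
    for name, value in sorted(all_features.items()):
        print(f"{name}: {value}")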