# File size: 9,487 Bytes

# 5c8f9d2

"""

Moduł do ekstrakcji zaawansowanych cech lingwistycznych przy użyciu spaCy.

"""
import re
import math
from collections import Counter
from statistics import mean, variance
from typing import Dict, List

import textstat
import spacy

from ..utils import safe_divide
from ..constants import CAMEL_CASE_PATTERN

# --- Funkcje analizujące podstawowe statystyki z Doc ---

def analyze_pos_stats(doc: spacy.tokens.Doc) -> Dict[str, float]:
    """Count part-of-speech, punctuation and stop-word statistics.

    A token counts as a word when it is neither punctuation nor whitespace
    and its coarse POS tag is not ``SYM``. Every other counter is taken over
    all tokens of ``doc``.

    A document without any words (for example one made only of punctuation
    or symbols) still reports its real token counters; only the three ratios
    fall back to 0.0.

    Args:
        doc: Analysed spaCy document.

    Returns:
        Dict with the counters ``words``, ``nouns``, ``verbs``,
        ``adjectives``, ``adverbs``, ``punctuations``, ``symbols``,
        ``stopwords``, ``oovs``, ``pos_x``, ``pos_num`` and the ratios
        ``noun_ratio``, ``verb_ratio``, ``adj_ratio`` (the respective count
        divided by the number of words).
    """
    pos_counts = Counter()
    words_count = punctuations = stopwords = oovs = 0

    # One pass over the tokens instead of a separate scan per counter.
    for token in doc:
        pos = token.pos_
        pos_counts[pos] += 1
        if token.is_punct:
            punctuations += 1
        elif not token.is_space and pos != 'SYM':
            words_count += 1
        if token.is_stop:
            stopwords += 1
        if token.is_oov:
            oovs += 1

    stats = {
        'words': words_count,
        'nouns': pos_counts["NOUN"],
        'verbs': pos_counts["VERB"],
        'adjectives': pos_counts["ADJ"],
        'adverbs': pos_counts["ADV"],
        'punctuations': punctuations,
        'symbols': pos_counts["SYM"],
        'stopwords': stopwords,
        'oovs': oovs,
        'pos_x': pos_counts["X"],
        'pos_num': pos_counts["NUM"],
    }

    if words_count:
        stats['noun_ratio'] = safe_divide(stats['nouns'], words_count)
        stats['verb_ratio'] = safe_divide(stats['verbs'], words_count)
        stats['adj_ratio'] = safe_divide(stats['adjectives'], words_count)
    else:
        # Keep the ratios at 0.0 explicitly so the result does not depend on
        # how the division helper treats a zero denominator.
        stats.update(noun_ratio=0.0, verb_ratio=0.0, adj_ratio=0.0)
    return stats

def analyze_doc_level_stats(doc: spacy.tokens.Doc, text: str) -> Dict[str, float]:
    """Compute features that describe the document as a whole.

    Words are the tokens of ``doc`` that are neither punctuation nor
    whitespace and are not tagged ``SYM``.

    Args:
        doc: Analysed spaCy document.
        text: Raw text, passed to ``textstat`` for the Gunning fog index.

    Returns:
        Dict with ``sentences`` (number of sentences in ``doc``),
        ``avg_word_length`` (characters per word), ``avg_sentence_length``
        (words per sentence), ``lexical_density`` (distinct lemmas divided by
        words), ``gunning_fog`` (0.0 for blank text), ``camel_case`` (words
        matched by ``CAMEL_CASE_PATTERN``) and ``capitalized_words`` (words
        whose text is entirely upper-case).
    """
    def _is_word(token) -> bool:
        return not (token.is_punct or token.is_space or token.pos_ == 'SYM')

    word_tokens = list(filter(_is_word, doc))
    n_words = len(word_tokens)
    n_sentences = sum(1 for _ in doc.sents)

    # Gather every per-word quantity in a single sweep.
    char_total = 0
    distinct_lemmas = set()
    camel_hits = 0
    upper_hits = 0
    for token in word_tokens:
        surface = token.text
        char_total += len(surface)
        distinct_lemmas.add(token.lemma_)
        if CAMEL_CASE_PATTERN.match(surface):
            camel_hits += 1
        if surface.isupper():
            upper_hits += 1

    if text.strip():
        fog_index = textstat.gunning_fog(text)
    else:
        fog_index = 0.0

    return {
        'sentences': n_sentences,
        'avg_word_length': safe_divide(char_total, n_words),
        'avg_sentence_length': safe_divide(n_words, n_sentences),
        'lexical_density': safe_divide(len(distinct_lemmas), n_words),
        'gunning_fog': fog_index,
        'camel_case': camel_hits,
        'capitalized_words': upper_hits,
    }

# --- Funkcje analizujące zaawansowane cechy lingwistyczne ---

def analyze_named_entities(doc: spacy.tokens.Doc) -> Dict[str, float]:
    """Summarise the named entities recognised in the document.

    Entity counts per category are divided by the number of alphabetic
    tokens. When the document has no alphabetic token, every value is zero.

    Args:
        doc: Analysed spaCy document.

    Returns:
        Dict with ``ner_count`` (number of entities) and the ratios
        ``ner_person_ratio`` (label ``persName``), ``ner_org_ratio``
        (``orgName``), ``ner_loc_ratio`` (``placeName`` or ``locName``) and
        ``ner_misc_ratio`` (any other label).
    """
    alpha_total = sum(1 for token in doc if token.is_alpha)
    if alpha_total == 0:
        return {"ner_count": 0, "ner_person_ratio": 0.0, "ner_org_ratio": 0.0,
                "ner_loc_ratio": 0.0, "ner_misc_ratio": 0.0}

    entities = doc.ents
    per_label = Counter(entity.label_ for entity in entities)

    persons = per_label["persName"]
    organisations = per_label["orgName"]
    locations = per_label["placeName"] + per_label["locName"]
    # Whatever is left carries a label outside the four handled above.
    miscellaneous = len(entities) - persons - organisations - locations

    return {
        "ner_count": len(entities),
        "ner_person_ratio": safe_divide(persons, alpha_total),
        "ner_org_ratio": safe_divide(organisations, alpha_total),
        "ner_loc_ratio": safe_divide(locations, alpha_total),
        "ner_misc_ratio": safe_divide(miscellaneous, alpha_total),
    }

def analyze_morphology(doc: spacy.tokens.Doc) -> Dict[str, float]:
    """Measure morphological diversity of the alphabetic tokens.

    For each of the features ``Case``, ``Tense`` and ``Mood`` the number of
    distinct values seen on alphabetic tokens is divided by the number of
    alphabetic tokens.

    Args:
        doc: Analysed spaCy document.

    Returns:
        Dict with ``case_diversity``, ``tense_diversity`` and
        ``mood_diversity``; all 0.0 when there is no alphabetic token.
    """
    alpha_count = 0
    cases, tenses, moods = set(), set(), set()

    for token in doc:
        if not token.is_alpha:
            continue
        alpha_count += 1
        morph = token.morph
        if morph:
            # The one-argument form already yields an empty list for a
            # missing feature; the extra ``default`` argument is only
            # accepted by newer spaCy releases, so it is not passed.
            cases.update(morph.get("Case"))
            tenses.update(morph.get("Tense"))
            moods.update(morph.get("Mood"))

    if not alpha_count:
        return {"case_diversity": 0.0, "tense_diversity": 0.0, "mood_diversity": 0.0}

    return {"case_diversity": safe_divide(len(cases), alpha_count),
            "tense_diversity": safe_divide(len(tenses), alpha_count),
            "mood_diversity": safe_divide(len(moods), alpha_count)}

def analyze_dependency_complexity(doc: spacy.tokens.Doc) -> Dict[str, float]:
    """Average, over sentences, the depth of the dependency tree.

    The depth of a sentence is the largest number of head links that must be
    followed from any of its tokens to reach a token that is its own head.

    Args:
        doc: Analysed spaCy document.

    Returns:
        Dict with ``avg_dependency_tree_depth``; 0.0 when no sentence
        contributed a depth.
    """
    def _links_to_root(token) -> int:
        hops = 0
        node = token
        # The cap of 100 hops bounds the walk even if heads never terminate.
        while node.head != node and hops < 100:
            node = node.head
            hops += 1
        return hops

    sentence_depths = []
    for sentence in doc.sents:
        members = list(sentence)
        if not members:
            continue
        sentence_depths.append(max(_links_to_root(member) for member in members))

    if sentence_depths:
        average_depth = mean(sentence_depths)
    else:
        average_depth = 0.0
    return {"avg_dependency_tree_depth": average_depth}

def analyze_pos_frequencies(doc: spacy.tokens.Doc, top_k: int = 10) -> Dict[str, float]:
    """Describe the part-of-speech mix of the most frequent words.

    Only alphabetic tokens are considered. The ``top_k`` most frequent
    lower-cased word forms are selected, and every token whose lower-cased
    text is one of them is a "top token".

    Args:
        doc: Analysed spaCy document.
        top_k: Number of most frequent word forms to select.

    Returns:
        Dict with ``top_words_total_count`` (number of top tokens), the
        ``top_words_{noun,verb,adj,other}_ratio`` values (share of each POS
        group among top tokens) and the ``top_words_*_prop_of_all_*`` values
        (top tokens of a POS group divided by all alphabetic tokens of that
        group). Everything is zero when there is no alphabetic token.
    """
    tokens = [t for t in doc if t.is_alpha]
    if not tokens:
        return {"top_words_total_count": 0, "top_words_noun_ratio": 0.0, "top_words_verb_ratio": 0.0,
                "top_words_adj_ratio": 0.0, "top_words_other_ratio": 0.0, "top_words_noun_prop_of_all_nouns": 0.0,
                "top_words_verb_prop_of_all_verbs": 0.0, "top_words_adj_prop_of_all_adjs": 0.0,
                "top_words_other_prop_of_all_others": 0.0}

    lowered = [t.text.lower() for t in tokens]
    word_counts = Counter(lowered)
    # A set gives constant-time membership checks in the loop below.
    top_words = {word for word, _ in word_counts.most_common(top_k)}

    tracked = ("NOUN", "VERB", "ADJ")
    overall = Counter()
    among_top = Counter()
    # Single pass: tally each token into its POS group, overall and among
    # the top tokens. Anything that is not a noun, verb or adjective is
    # grouped under "OTHER".
    for token, form in zip(tokens, lowered):
        pos = token.pos_
        group = pos if pos in tracked else "OTHER"
        overall[group] += 1
        if form in top_words:
            among_top[group] += 1

    total_top_count = sum(among_top.values())
    top_noun, top_verb = among_top["NOUN"], among_top["VERB"]
    top_adj, top_other = among_top["ADJ"], among_top["OTHER"]

    return {
        "top_words_total_count": total_top_count,
        "top_words_noun_ratio": safe_divide(top_noun, total_top_count),
        "top_words_verb_ratio": safe_divide(top_verb, total_top_count),
        "top_words_adj_ratio": safe_divide(top_adj, total_top_count),
        "top_words_other_ratio": safe_divide(top_other, total_top_count),
        "top_words_noun_prop_of_all_nouns": safe_divide(top_noun, overall["NOUN"]),
        "top_words_verb_prop_of_all_verbs": safe_divide(top_verb, overall["VERB"]),
        "top_words_adj_prop_of_all_adjs": safe_divide(top_adj, overall["ADJ"]),
        "top_words_other_prop_of_all_others": safe_divide(top_other, overall["OTHER"]),
    }

# --- Funkcje wymagające tylko tekstu ---

def compute_readability_indices(text: str, sentences: List[str]) -> Dict[str, float]:
    """Compute the LIX and RIX readability indices.

    Words are the ``\\w+`` runs found in ``text``; a long word has more than
    six characters. The number of sentences is ``len(sentences)``.

        LIX = words / sentences + 100 * long_words / words
        RIX = long_words / sentences

    Args:
        text: Raw text to analyse.
        sentences: The sentences of ``text``; only their count is used.

    Returns:
        Dict with ``lix`` and ``rix``; both 0.0 when ``text`` is blank.
    """
    if not text.strip():
        return {"lix": 0.0, "rix": 0.0}

    words = re.findall(r'\w+', text)
    num_words = len(words)
    num_sentences = len(sentences)
    long_words = sum(1 for w in words if len(w) > 6)

    lix = safe_divide(num_words, num_sentences) + safe_divide(long_words * 100, num_words)
    # RIX is long words per sentence, not the long-word percentage (that
    # percentage is already the second term of LIX).
    rix = safe_divide(long_words, num_sentences)
    return {"lix": lix, "rix": rix}

def analyze_polish_diacritics_distribution(text: str) -> Dict[str, float]:
    """Measure how unevenly Polish diacritic letters are used in the text.

    For every Polish diacritic letter (lower and upper case) that occurs in
    ``text``, its relative frequency (occurrences divided by the text length)
    is taken; the result is the population standard deviation of those
    frequencies. Letters that do not occur are left out.

    Args:
        text: Raw text to analyse.

    Returns:
        Dict with ``diacritics_std_dev``; 0.0 for empty text or text without
        any Polish diacritic.
    """
    text_length = len(text)
    if not text_length:
        return {"diacritics_std_dev": 0.0}

    occurrences = Counter(text)
    shares = []
    for letter in 'ąćęłńóśźżĄĆĘŁŃÓŚŹŻ':
        hits = occurrences.get(letter, 0)
        if hits:
            shares.append(hits / text_length)

    if not shares:
        return {"diacritics_std_dev": 0.0}

    centre = mean(shares)
    squared_deviations = [(share - centre) ** 2 for share in shares]
    spread = sum(squared_deviations) / len(squared_deviations)
    return {"diacritics_std_dev": math.sqrt(spread)}

def analyze_question_sentences(sentences: List[str]) -> Dict[str, float]:
    """Compute the share of sentences that end with a question mark.

    Trailing whitespace is ignored when checking the final character.

    Args:
        sentences: Sentences of the analysed text.

    Returns:
        Dict with ``question_sentence_ratio``; 0.0 when ``sentences`` is
        empty.
    """
    if not sentences:
        return {"question_sentence_ratio": 0.0}

    question_count = 0
    for sentence in sentences:
        if sentence.rstrip().endswith('?'):
            question_count += 1

    ratio = safe_divide(question_count, len(sentences))
    return {"question_sentence_ratio": ratio}

# --- Główna funkcja agregująca ---

def calculate_all_spacy_features(doc: spacy.tokens.Doc, text: str, sentences: List[str]) -> Dict[str, float]:
    """Run every feature extractor of this module and merge the results.

    The extractors run in a fixed order and their dicts are merged in that
    order, so a later extractor would overwrite an earlier one on a shared
    key.

    Args:
        doc: Analysed spaCy document.
        text: Raw text of the document.
        sentences: Sentences of the document as strings.

    Returns:
        Single dict containing the features of all extractors.
    """
    pipeline = (
        (analyze_pos_stats, (doc,)),
        (analyze_doc_level_stats, (doc, text)),
        (analyze_named_entities, (doc,)),
        (analyze_morphology, (doc,)),
        (analyze_dependency_complexity, (doc,)),
        (analyze_pos_frequencies, (doc,)),
        (compute_readability_indices, (text, sentences)),
        (analyze_polish_diacritics_distribution, (text,)),
        (analyze_question_sentences, (sentences,)),
    )

    merged = {}
    for extractor, arguments in pipeline:
        merged.update(extractor(*arguments))
    return merged