"""
|
|
|
Modu艂 do ekstrakcji zaawansowanych cech lingwistycznych przy u偶yciu spaCy.
|
|
|
"""
|
|
|
import re
|
|
|
import math
|
|
|
from collections import Counter
|
|
|
from statistics import mean, variance
|
|
|
from typing import Dict, List
|
|
|
|
|
|
import textstat
|
|
|
import spacy
|
|
|
|
|
|
from ..utils import safe_divide
|
|
|
from ..constants import CAMEL_CASE_PATTERN
|
|
|
|
|
|
|
|
|
|
|
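# Note: safe_divide lives in ..utils, outside this module; it is assumed to
# return 0.0 when the denominator is zero, which the ratio features below
# rely on for their zero-guard behavior.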
|
def analyze_pos_stats(doc: spacy.tokens.Doc) -> Dict[str, float]:
    """Computes part-of-speech (POS), punctuation, and stopword statistics."""
    words = [t for t in doc if not t.is_punct and not t.is_space and t.pos_ != 'SYM']
    words_count = len(words)

    if not words_count:
        return {'words': 0, 'nouns': 0, 'verbs': 0, 'adjectives': 0, 'adverbs': 0,
                'punctuations': 0, 'symbols': 0, 'stopwords': 0, 'oovs': 0,
                'pos_x': 0, 'pos_num': 0, 'noun_ratio': 0.0, 'verb_ratio': 0.0,
                'adj_ratio': 0.0}

    stats = {
        'words': words_count,
        'nouns': sum(1 for t in doc if t.pos_ == "NOUN"),
        'verbs': sum(1 for t in doc if t.pos_ == "VERB"),
        'adjectives': sum(1 for t in doc if t.pos_ == "ADJ"),
        'adverbs': sum(1 for t in doc if t.pos_ == "ADV"),
        'punctuations': sum(1 for t in doc if t.is_punct),
        'symbols': sum(1 for t in doc if t.pos_ == "SYM"),
        'stopwords': sum(1 for t in doc if t.is_stop),
        'oovs': sum(1 for t in doc if t.is_oov),
        'pos_x': sum(1 for t in doc if t.pos_ == "X"),
        'pos_num': sum(1 for t in doc if t.pos_ == "NUM"),
    }
    # Ratios are normalized by word tokens only (punctuation, whitespace and
    # symbols are excluded from the denominator).
    stats['noun_ratio'] = safe_divide(stats['nouns'], words_count)
    stats['verb_ratio'] = safe_divide(stats['verbs'], words_count)
    stats['adj_ratio'] = safe_divide(stats['adjectives'], words_count)
    return stats

def analyze_doc_level_stats(doc: spacy.tokens.Doc, text: str) -> Dict[str, float]:
    """Analyzes features at the level of the whole document."""
    words = [t for t in doc if not t.is_punct and not t.is_space and t.pos_ != 'SYM']
    words_count = len(words)
    sentences_count = len(list(doc.sents))

    return {
        'sentences': sentences_count,
        'avg_word_length': safe_divide(sum(len(t.text) for t in words), words_count),
        'avg_sentence_length': safe_divide(words_count, sentences_count),
        # Computed as the unique-lemma / token ratio (a lemma-based TTR).
        'lexical_density': safe_divide(len({t.lemma_ for t in words}), words_count),
        'gunning_fog': textstat.gunning_fog(text) if text.strip() else 0.0,
        'camel_case': sum(1 for t in words if CAMEL_CASE_PATTERN.match(t.text)),
        # Counts fully uppercase tokens (e.g. acronyms), not title-cased words.
        'capitalized_words': sum(1 for t in words if t.text.isupper()),
    }

def analyze_named_entities(doc: spacy.tokens.Doc) -> Dict[str, float]:
    """Analyzes recognized named entities (NER)."""
    alpha_words = [t for t in doc if t.is_alpha]
    if not alpha_words:
        return {"ner_count": 0, "ner_person_ratio": 0.0, "ner_org_ratio": 0.0,
                "ner_loc_ratio": 0.0, "ner_misc_ratio": 0.0}

    ents = doc.ents
    # Entity labels follow the Polish spaCy pipelines (persName, orgName, ...).
    return {
        "ner_count": len(ents),
        "ner_person_ratio": safe_divide(sum(1 for e in ents if e.label_ == "persName"), len(alpha_words)),
        "ner_org_ratio": safe_divide(sum(1 for e in ents if e.label_ == "orgName"), len(alpha_words)),
        "ner_loc_ratio": safe_divide(sum(1 for e in ents if e.label_ in ["placeName", "locName"]), len(alpha_words)),
        "ner_misc_ratio": safe_divide(sum(1 for e in ents if e.label_ not in ["persName", "orgName", "placeName", "locName"]), len(alpha_words)),
    }

def analyze_morphology(doc: spacy.tokens.Doc) -> Dict[str, float]:
    """Analyzes morphological diversity."""
    alpha_tokens = [t for t in doc if t.is_alpha]
    if not alpha_tokens:
        return {"case_diversity": 0.0, "tense_diversity": 0.0, "mood_diversity": 0.0}

    cases, tenses, moods = [], [], []
    for token in alpha_tokens:
        if token.morph:
            cases.extend(token.morph.get("Case", []))
            tenses.extend(token.morph.get("Tense", []))
            moods.extend(token.morph.get("Mood", []))

    # Diversity: distinct feature values normalized by the alphabetic token count.
    return {"case_diversity": safe_divide(len(set(cases)), len(alpha_tokens)),
            "tense_diversity": safe_divide(len(set(tenses)), len(alpha_tokens)),
            "mood_diversity": safe_divide(len(set(moods)), len(alpha_tokens))}

def analyze_dependency_complexity(doc: spacy.tokens.Doc) -> Dict[str, float]:
    """Computes the average dependency tree depth across sentences."""
    depths = []
    for sent in doc.sents:
        if not list(sent):
            continue
        max_depth = 0
        for token in sent:
            # Walk from the token up to the sentence root; the bound of 100
            # steps guards against malformed head chains.
            dist = 0
            curr = token
            while curr.head != curr and dist < 100:
                curr = curr.head
                dist += 1
            max_depth = max(max_depth, dist)
        depths.append(max_depth)
    return {"avg_dependency_tree_depth": mean(depths) if depths else 0.0}

def analyze_pos_frequencies(doc: spacy.tokens.Doc, top_k: int = 10) -> Dict[str, float]:
    """Analyzes the POS distribution of the most frequent words."""
    tokens = [t for t in doc if t.is_alpha]
    if not tokens:
        return {"top_words_total_count": 0, "top_words_noun_ratio": 0.0, "top_words_verb_ratio": 0.0,
                "top_words_adj_ratio": 0.0, "top_words_other_ratio": 0.0, "top_words_noun_prop_of_all_nouns": 0.0,
                "top_words_verb_prop_of_all_verbs": 0.0, "top_words_adj_prop_of_all_adjs": 0.0,
                "top_words_other_prop_of_all_others": 0.0}

    word_counts = Counter(t.text.lower() for t in tokens)
    top_words = {w for w, _ in word_counts.most_common(top_k)}

    top_tokens = [t for t in tokens if t.text.lower() in top_words]
    total_top_count = len(top_tokens)

    top_noun = sum(1 for t in top_tokens if t.pos_ == 'NOUN')
    top_verb = sum(1 for t in top_tokens if t.pos_ == 'VERB')
    top_adj = sum(1 for t in top_tokens if t.pos_ == 'ADJ')
    top_other = total_top_count - (top_noun + top_verb + top_adj)

    total_nouns = sum(1 for t in tokens if t.pos_ == "NOUN")
    total_verbs = sum(1 for t in tokens if t.pos_ == "VERB")
    total_adjs = sum(1 for t in tokens if t.pos_ == "ADJ")
    total_others = len(tokens) - (total_nouns + total_verbs + total_adjs)

    return {
        "top_words_total_count": total_top_count,
        "top_words_noun_ratio": safe_divide(top_noun, total_top_count),
        "top_words_verb_ratio": safe_divide(top_verb, total_top_count),
        "top_words_adj_ratio": safe_divide(top_adj, total_top_count),
        "top_words_other_ratio": safe_divide(top_other, total_top_count),
        "top_words_noun_prop_of_all_nouns": safe_divide(top_noun, total_nouns),
        "top_words_verb_prop_of_all_verbs": safe_divide(top_verb, total_verbs),
        "top_words_adj_prop_of_all_adjs": safe_divide(top_adj, total_adjs),
        "top_words_other_prop_of_all_others": safe_divide(top_other, total_others),
    }

def compute_readability_indices(text: str, sentences: List[str]) -> Dict[str, float]:
    """Computes the LIX and RIX readability indices.

    LIX = words/sentences + 100 * long_words/words and
    RIX = long_words/sentences, where a long word has more than six characters.
    """
    if not text.strip():
        return {"lix": 0.0, "rix": 0.0}
    words = re.findall(r'\w+', text)
    num_words = len(words)
    num_sentences = len(sentences)
    long_words = sum(1 for w in words if len(w) > 6)
    lix = safe_divide(num_words, num_sentences) + safe_divide(long_words * 100, num_words)
    rix = safe_divide(long_words, num_sentences)
    return {"lix": lix, "rix": rix}

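# Worked example for compute_readability_indices (hypothetical numbers): a text
# of 44 words in 4 sentences with 11 words longer than six characters yields
# LIX = 44/4 + 100*11/44 = 36.0 and RIX = 11/4 = 2.75.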
def analyze_polish_diacritics_distribution(text: str) -> Dict[str, float]:
    """Analyzes the distribution of Polish diacritic characters."""
    polish_diacritics = 'ąćęłńóśźżĄĆĘŁŃÓŚŹŻ'
    total = len(text)
    if total == 0:
        return {"diacritics_std_dev": 0.0}
    counts = Counter(text)
    diac_counts = [counts[ch] for ch in polish_diacritics if ch in counts]
    if not diac_counts:
        return {"diacritics_std_dev": 0.0}
    diac_freqs = [c / total for c in diac_counts]
    # Population standard deviation of the per-character relative frequencies.
    mean_freq = mean(diac_freqs)
    variance_val = sum((x - mean_freq) ** 2 for x in diac_freqs) / len(diac_freqs)
    return {"diacritics_std_dev": math.sqrt(variance_val)}

def analyze_question_sentences(sentences: List[str]) -> Dict[str, float]:
    """Computes the ratio of question sentences to all sentences."""
    if not sentences:
        return {"question_sentence_ratio": 0.0}
    questions = sum(1 for s in sentences if s.strip().endswith('?'))
    return {"question_sentence_ratio": safe_divide(questions, len(sentences))}

def calculate_all_spacy_features(doc: spacy.tokens.Doc, text: str, sentences: List[str]) -> Dict[str, float]:
    """Aggregates all of the advanced linguistic features above."""
    features = {}
    features.update(analyze_pos_stats(doc))
    features.update(analyze_doc_level_stats(doc, text))
    features.update(analyze_named_entities(doc))
    features.update(analyze_morphology(doc))
    features.update(analyze_dependency_complexity(doc))
    features.update(analyze_pos_frequencies(doc))
    features.update(compute_readability_indices(text, sentences))
    features.update(analyze_polish_diacritics_distribution(text))
    features.update(analyze_question_sentences(sentences))
    return features
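

if __name__ == "__main__":
    # Minimal smoke-test sketch, not part of the package API. It assumes a
    # Polish spaCy pipeline such as "pl_core_news_sm" is installed
    # (python -m spacy download pl_core_news_sm); the model name and sample
    # text are illustrative. Because this module uses relative imports, run it
    # as `python -m <package>.<module>` rather than as a plain script.
    nlp = spacy.load("pl_core_news_sm")
    sample = "Ala ma kota. Czy Jan Kowalski mieszka w Warszawie?"
    doc = nlp(sample)
    sents = [s.text for s in doc.sents]
    for name, value in sorted(calculate_all_spacy_features(doc, sample, sents).items()):
        print(f"{name}: {value}")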