""" Moduł do ekstrakcji podstawowych, statystycznych cech tekstu. Zawiera funkcje do analizy na poziomie znaków, słów i linii, które nie wymagają zaawansowanych modeli lingwistycznych. """ import re from collections import Counter from typing import Dict, List from ..utils import safe_divide from ..constants import (PUNCTUATION_PATTERN, EXCESSIVE_SPACES_PATTERN, ALLOWED_CHARS_PATTERN, COMMON_CHARACTERS) # --- Funkcje analizujące znaki --- def analyze_character_stats(text: str, text_lower: str) -> Dict[str, float]: """Oblicza podstawowe statystyki na poziomie znaków.""" total_chars = len(text) char_counts = Counter(text) if not total_chars: return { 'characters': 0, 'digit_count': 0, 'digit_ratio': 0.0, 'overall_uppercase_ratio': 0.0, 'unique_characters_all': 0, 'unique_characters_lower': 0, 'characters_out_of_common': 0, 'tabs': 0, 'multispaces': 0 } return { 'characters': total_chars, 'digit_count': sum(ch.isdigit() for ch in text), 'digit_ratio': safe_divide(sum(ch.isdigit() for ch in text), total_chars), 'overall_uppercase_ratio': safe_divide(sum(ch.isupper() for ch in text), total_chars), 'unique_characters_all': len(set(text)), 'unique_characters_lower': len(set(text_lower)), 'characters_out_of_common': len([c for c in text if c not in COMMON_CHARACTERS]), 'tabs': text.count('\t'), 'multispaces': len(EXCESSIVE_SPACES_PATTERN.findall(text)) } def analyze_punctuation_stats(text: str) -> Dict[str, float]: """Analizuje występowanie interpunkcji i specyficznych znaków.""" total_chars = len(text) if not total_chars: return { 'punct_frequency': 0.0, 'bracet_count': 0, 'bracket_ratio': 0.0, 'count_special_chars': 0 } open_paren = text.count('(') close_paren = text.count(')') open_bracket = text.count('[') close_bracket = text.count(']') return { 'punct_frequency': safe_divide(len(PUNCTUATION_PATTERN.findall(text)), total_chars), 'bracet_count': open_paren + close_paren + open_bracket + close_bracket, 'bracket_ratio': safe_divide(open_bracket, close_bracket), 'count_special_chars': len(re.findall(r'(\?|!){3,}', text)) } def analyze_advanced_char_features(text: str) -> Dict[str, float]: """Analizuje zaawansowane cechy rozkładu znaków i słów (dawniej analyze_char_features).""" total_chars = len(text) words_found = re.findall(r'\w+', text) word_count = len(words_found) if not total_chars or not word_count: return { 'word_count': 0, 'unique_word_count': 0, 'top_word_count': 0, 'top_word_ratio': 0.0, 'top_5_ratio': 0.0, 'top_10_ratio': 0.0, 'hapax_legomena_ratio': 0.0, 'looping_suspicion': 0, 'polish_diacritics_count': 0, 'polish_diacritics_ratio': 0.0, 'polish_diacritics_per_word': 0.0, 'diacritics_to_letters_ratio': 0.0, 'replacement_char_count': 0, 'replacement_char_ratio': 0.0, 'not_allowed_chars_count': 0, 'not_allowed_chars_ratio': 0.0, 'encoding_suspicion': 0, 'single_char_word_count': 0, 'single_char_unique_count': 0, 'single_char_upper_count': 0, 'single_char_lower_count': 0, 'single_char_upper_unique_count': 0, 'single_char_lower_unique_count': 0, 'single_char_top_1_codepoint': 0, 'single_char_top_2_codepoint': 0, 'single_char_top_3_codepoint': 0 } word_freq = Counter(words_found) most_common = word_freq.most_common(10) polish_diacritics = 'ąćęłńóśźżĄĆĘŁŃÓŚŹŻ' char_counts = Counter(text) diac_count = sum(char_counts.get(ch, 0) for ch in polish_diacritics) letters_count = sum(1 for ch in text if ch.isalpha()) single_chars = [w for w in words_found if len(w) == 1] single_char_freq = Counter(single_chars) top_3_single = single_char_freq.most_common(3) top_codes = [ord(w) for w, _ in top_3_single] while len(top_codes) < 3: top_codes.append(0) replacement_count = char_counts.get('\uFFFD', 0) not_allowed_count = sum(1 for ch in text if not ALLOWED_CHARS_PATTERN.match(ch)) replacement_ratio = safe_divide(replacement_count, total_chars) not_allowed_ratio = safe_divide(not_allowed_count, total_chars) top_word_ratio = safe_divide(most_common[0][1] if most_common else 0, word_count) top_5_ratio = safe_divide(sum(cnt for _, cnt in most_common[:5]), word_count) features = { 'word_count': word_count, 'unique_word_count': len(word_freq), 'top_word_count': most_common[0][1] if most_common else 0, 'top_word_ratio': top_word_ratio, 'top_5_ratio': top_5_ratio, 'top_10_ratio': safe_divide(sum(cnt for _, cnt in most_common[:10]), word_count), 'hapax_legomena_ratio': safe_divide(sum(1 for cnt in word_freq.values() if cnt == 1), word_count), 'looping_suspicion': 1 if (top_word_ratio > 0.15 or top_5_ratio > 0.4) else 0, 'polish_diacritics_count': diac_count, 'polish_diacritics_ratio': safe_divide(diac_count, total_chars), 'polish_diacritics_per_word': safe_divide(diac_count, word_count), 'diacritics_to_letters_ratio': safe_divide(diac_count, letters_count), 'replacement_char_count': replacement_count, 'replacement_char_ratio': replacement_ratio, 'not_allowed_chars_count': not_allowed_count, 'not_allowed_chars_ratio': not_allowed_ratio, 'encoding_suspicion': 1 if (replacement_ratio > 0.01 or not_allowed_ratio > 0.05) else 0, 'single_char_word_count': len(single_chars), 'single_char_unique_count': len(single_char_freq), 'single_char_upper_count': sum(1 for w in single_chars if w.isupper()), 'single_char_lower_count': sum(1 for w in single_chars if w.islower()), 'single_char_upper_unique_count': len({w for w in single_chars if w.isupper()}), 'single_char_lower_unique_count': len({w for w in single_chars if w.islower()}), 'single_char_top_1_codepoint': top_codes[0], 'single_char_top_2_codepoint': top_codes[1], 'single_char_top_3_codepoint': top_codes[2], } return features # --- Funkcje analizujące słowa --- def analyze_word_stats(words: List[str], words_lower: List[str]) -> Dict[str, float]: total_words = len(words) if not total_words: return {'mean_word_length': 0.0, 'lexical_diversity': 0.0, 'count_caps': 0.0, 'word_isupper<5': 0, 'word_isupper>5': 0, 'count_digit_to_caps': 0.0} digit_count = sum(1 for w in words if any(ch.isdigit() for ch in w)) caps_count = sum(1 for w in words if w.isupper()) return { 'mean_word_length': safe_divide(sum(len(w) for w in words_lower), total_words), 'lexical_diversity': safe_divide(len(set(words_lower)), total_words), 'count_caps': safe_divide(caps_count, total_words), 'word_isupper<5': sum(1 for w in words if w.isupper() and len(w) < 5), 'word_isupper>5': sum(1 for w in words if w.isupper() and len(w) >= 5), 'count_digit_to_caps': safe_divide(digit_count, caps_count) } def count_contextual_word_repetitions(words_lower: List[str]) -> Dict[str, float]: """Liczy powtórzenia tego samego słowa bezpośrednio po sobie.""" count = sum(1 for i in range(len(words_lower) - 1) if words_lower[i] == words_lower[i+1]) return { "contextual_word_repetitions_count": count, "contextual_word_repetitions_ratio": safe_divide(count, len(words_lower)) } def count_single_chars_and_ratio(text: str) -> Dict[str, float]: """Liczy słowa składające się z jednego znaku (wersja z oryginalnego kodu).""" t = " " + text + " " count = sum(1 for i in range(1, len(t) - 1) if t[i-1].isspace() and t[i+1].isspace()) return { 'single_char_count': count, 'single_char_ratio': safe_divide(count, len(t)) } # --- Funkcje analizujące linie --- def analyze_line_length_stats(lines: List[str]) -> Dict[str, float]: """Oblicza statystyki związane z długością linii.""" total_lines = len(lines) if not total_lines: return { 'average_lines': 0.0, 'short_line_count_3': 0, 'short_line_count_5': 0, 'short_line_count_10': 0, 'short_line_count_20': 0, 'short_line_ratio_3': 0.0, 'short_line_ratio_5': 0.0, 'short_line_ratio_10': 0.0, 'short_line_ratio_20': 0.0 } line_lengths = [len(line) for line in lines] stats = {'average_lines': safe_divide(sum(line_lengths), total_lines)} for threshold in [3, 5, 10, 20]: count = sum(1 for length in line_lengths if length < threshold) stats[f'short_line_count_{threshold}'] = count stats[f'short_line_ratio_{threshold}'] = safe_divide(count, total_lines) return stats def analyze_line_content(lines: List[str]) -> Dict[str, float]: """Analizuje zawartość linii pod kątem specyficznych wzorców.""" total_lines = len(lines) if not total_lines: return { 'blank_lines': 0, 'blank_lines_ratio': 0.0, 'ellipsis_fractions': 0.0, 'line_counts': 0, 'digit_start_lines': 0, 'duplicated_lines': 0, 'duplicate_line_ratio': 0.0 } non_empty_lines = [line for line in lines if line.strip()] blanks_count = total_lines - len(non_empty_lines) ellipsis_lines_count = sum(1 for line in lines if line.strip().endswith(('...', '…'))) digit_start_lines_count = sum(1 for line in non_empty_lines if line.strip() and line.strip()[0].isdigit()) line_counts = Counter(non_empty_lines) duplicated_lines_count = sum(cnt - 1 for cnt in line_counts.values() if cnt > 1) return { 'blank_lines': blanks_count, 'blank_lines_ratio': safe_divide(blanks_count, total_lines), 'ellipsis_fractions': safe_divide(ellipsis_lines_count, total_lines), 'line_counts': total_lines, 'digit_start_lines': digit_start_lines_count, 'duplicated_lines': duplicated_lines_count, 'duplicate_line_ratio': safe_divide(duplicated_lines_count, len(non_empty_lines)) } def count_lorem_ipsum(text_lower: str) -> Dict[str, float]: """Oblicza stosunek lorem ipsum""" count = text_lower.count('lorem ipsum') return {'lorem_ipsum_ratio': safe_divide(count, len(text_lower))} # --- Główna funkcja agregująca --- def calculate_all_base_features(text: str, text_lower: str, words: List[str], words_lower: List[str], lines: List[str]) -> Dict[str, float]: """Agreguje wszystkie podstawowe cechy tekstu z tego modułu.""" features = {} features.update(analyze_character_stats(text, text_lower)) features.update(analyze_punctuation_stats(text)) features.update(analyze_advanced_char_features(text)) features.update(analyze_word_stats(words, words_lower)) features.update(count_contextual_word_repetitions(words_lower)) features.update(count_single_chars_and_ratio(text)) features.update(analyze_line_length_stats(lines)) features.update(analyze_line_content(lines)) features.update(count_lorem_ipsum(text_lower)) return features