|
|
"""
|
|
|
Modu艂 do ekstrakcji cech opartych na wyra偶eniach regularnych.
|
|
|
|
|
|
Odpowiedzialny za wykrywanie i zliczanie predefiniowanych wzorc贸w,
|
|
|
takich jak dane osobowe i wra偶liwe (PII), numery identyfikacyjne,
|
|
|
daty, adresy e-mail i inne specyficzne formaty.
|
|
|
"""
|
|
|
|
|
|
from typing import Dict, Mapping, Optional, Pattern

from ..constants import PII_REGEX_PATTERNS
|
|
|
|
|
|
def calculate_all_regex_features(
    text: str,
    patterns: Optional[Mapping[str, Pattern[str]]] = None,
) -> Dict[str, int]:
    """Count the matches of every configured regex pattern in ``text``.

    Each pattern is applied with ``findall`` and the number of matches is
    stored under the pattern's name. To stay compatible with the original
    feature set, the ``'domestic_phone_reg'`` count is folded into
    ``'phone_reg'`` and is not reported as a separate key (this happens only
    when both names are present).

    Args:
        text: Text to analyse. A falsy value (``""`` or ``None``) yields a
            count of 0 for every feature without running any pattern.
        patterns: Mapping of feature name to an object exposing
            ``findall(text)`` (normally a compiled regular expression).
            Defaults to ``PII_REGEX_PATTERNS``.

    Returns:
        Dictionary whose keys are pattern names (e.g. ``'email_reg'``) and
        whose values are the number of matches found. The key set is the
        same for empty and non-empty input.
    """
    if patterns is None:
        patterns = PII_REGEX_PATTERNS

    features: Dict[str, int] = {}
    for name, pattern in patterns.items():
        if not text:
            # Falsy text is reported as all zeros; no pattern is executed.
            features[name] = 0
            continue
        try:
            features[name] = len(pattern.findall(text))
        except Exception as e:
            # Best effort: one failing pattern must not prevent the other
            # features from being computed, so report it and count 0.
            print(f"Błąd podczas przetwarzania wzorca '{name}': {e}")
            features[name] = 0

    # Merge the domestic phone count into the generic one. Running the same
    # merge for empty and non-empty text keeps the returned keys identical.
    if 'phone_reg' in features and 'domestic_phone_reg' in features:
        features['phone_reg'] += features.pop('domestic_phone_reg')

    return features