Spaces:
Runtime error
Runtime error
Upload fuzzy_matching.py
Browse files
feat: a Python script with functions used to process and map users' locations to the most similar matches from a reference dataset of town names
- src/fuzzy_matching.py +258 -0
src/fuzzy_matching.py
ADDED
|
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Problem:
|
| 3 |
+
Nt3awnou's platform collects raw data filled manually by users (people in need).
|
| 4 |
+
Among this data is the user's localisation.
|
| 5 |
+
The localisation is a text input that is not standardized:
|
| 6 |
+
i.e. a user can input a single or multiple locations
|
| 7 |
+
(either douars/provinces/communes/regions or all combined),
|
| 8 |
+
in arabic or latin, with misspellings etc.
|
| 9 |
+
This doesn't help in visualization or in statistics
|
| 10 |
+
where localisations can be redundant because they were written in different manners.
|
| 11 |
+
|
| 12 |
+
Examples
|
| 13 |
+
```
|
| 14 |
+
دوار تجكَالت
|
| 15 |
+
ابرداتن ازكور
|
| 16 |
+
خزامة
|
| 17 |
+
Tansgharte
|
| 18 |
+
دوار امندار
|
| 19 |
+
Douar Essour Tidrara Aghwatim Tahnaouet Al Haouz
|
| 20 |
+
دوار تكاديرت
|
| 21 |
+
Douar Essour tidrara- aghouatine- Tahanaout-El Haouz
|
| 22 |
+
```
|
| 23 |
+
Solution:
|
| 24 |
+
We collected a reference dataset that contains all douar names (arabic and latin)
|
| 25 |
+
with their corresponding regions, communes and provinces.
|
| 26 |
+
We developed methods using fuzzy matching and phonetics
|
| 27 |
+
to map the user's localisation to the closest match in the reference dataset
|
| 28 |
+
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
from typing import Tuple
|
| 32 |
+
from pyphonetics import RefinedSoundex, Metaphone
|
| 33 |
+
import math
|
| 34 |
+
import difflib
|
| 35 |
+
import re
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# Approximate epicenter [lat, lon] of the event the platform responds to.
# NOTE(review): not referenced by any function in this module — presumably
# used by callers together with get_geometric_distance; confirm before removal.
EPICENTER_LOCATION = [31.12210171476489, -8.42945837915193]

# Maximum gap from the best match distance tolerated when keeping several
# candidate matches (passed to get_uncertainty_range inside match_word).
certainty_threshold = 1
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def extract_ngrams(text, n):
    """
    Return the list of n-grams of *text*.

    Each n-gram is the slice of n consecutive items joined with single
    spaces. An empty list is returned when n is outside [1, len(text)].
    """
    if not 1 <= n <= len(text):
        # invalid n-gram size for this input
        return []

    return [
        ' '.join(text[start:start + n])
        for start in range(len(text) - n + 1)
    ]
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def get_phonetics_distance(w1, w2):
    """
    Return the average Levenshtein distance between the Metaphone and
    RefinedSoundex encodings of the two words, plus a small error term
    (0.05) so phonetic matches never score as well as exact matches.
    """
    metaphone_d = Metaphone().distance(w1, w2, metric='levenshtein')
    soundex_d = RefinedSoundex().distance(w1, w2, metric='levenshtein')
    return (metaphone_d + soundex_d) / 2 + 0.05
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def get_top_n_phonetics_matches(
        w: str, ref_words: list, threshold=1, top_n=1) -> list[Tuple]:
    """
    Return up to *top_n* (word, distance) pairs from *ref_words* whose
    phonetic distance to *w* is at most *threshold*, closest first.
    Returns an empty list for a blank input word.
    """
    if not w:
        return []

    # score each distinct reference word once
    scored = {}
    for ref in ref_words:
        scored[ref] = get_phonetics_distance(w, ref)

    # keep only the words close enough, ordered by increasing distance
    close_enough = sorted(
        ((ref, d) for ref, d in scored.items() if d <= threshold),
        key=lambda pair: pair[1],
    )
    return close_enough[:top_n]
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def get_geometric_distance(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    """
    Return the great-circle distance (in kilometres) between two points
    given in decimal degrees, using the haversine formula.

    Bug fix: the previous version added cos(lat1) into the same term as
    sin²(dlat/2) and then multiplied the whole sum by the longitude term,
    which made the latitude difference contribute nothing on its own
    (two points on the same meridian came out at ~0 km apart).

    Args:
        lat1, lon1: latitude/longitude of the first point, in degrees.
        lat2, lon2: latitude/longitude of the second point, in degrees.

    Returns:
        Distance in kilometres (mean Earth radius of 6371 km).
    """
    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    a = (
        math.sin(dlat / 2) ** 2
        + math.cos(math.radians(lat1))
        * math.cos(math.radians(lat2))
        * math.sin(dlon / 2) ** 2
    )
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return 6371 * c
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def are_village_names_similar(village_a: str, village_b: str) -> bool:
    """
    Return True when the two village names are at least 90% similar
    according to difflib's SequenceMatcher ratio (strict fuzzy matching).

    Fixes: the return annotation previously said ``float`` although the
    function always returned a bool; the if/return-True/return-False
    pattern is collapsed into a single comparison.
    """
    return difflib.SequenceMatcher(None, village_a, village_b).ratio() >= 0.90
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def get_uncertainty_range(input_dict: dict, threshold: float) -> dict:
    """
    Keep only the matches whose distance is within *threshold* of the
    best (smallest) distance.

    Fixes: the docstring claimed a "list of tuples" and the annotation
    said ``-> list`` although a dict was always returned; the pointless
    f-string coercion of the first key is removed.

    Args:
        input_dict: mapping of category -> (match, distance) tuple.
        threshold: maximum allowed gap from the smallest distance.

    Returns:
        A dict of the retained category -> (match, distance) entries,
        ordered by increasing distance. Inputs with at most one entry
        are returned unchanged.
    """
    if len(input_dict) <= 1:
        return input_dict

    # sort entries by their distance (second element of the value tuple)
    ordered = sorted(input_dict.items(), key=lambda item: item[1][1])
    best_distance = ordered[0][1][1]

    # the best match is always kept; later entries are kept while their
    # distance stays within `threshold` of the best one — entries are
    # sorted, so we can stop at the first one outside the range
    result = {ordered[0][0]: ordered[0][1]}
    for key, value in ordered[1:]:
        if abs(best_distance - value[1]) <= threshold:
            result[key] = value
        else:
            break

    return result
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def match_word(w: str, ref_dict: dict, select_one_match: bool = False) -> dict:
    """
    Return the closest match(es) of *w* per category of *ref_dict*.

    Matching is attempted in order of decreasing confidence:
      1. exact match (distance 0, stops the category scan),
      2. strict fuzzy matching via are_village_names_similar (distance 0.01),
      3. phonetic matching via get_top_n_phonetics_matches (distance > 0.05).

    Args:
        w: raw user token; stripped and upper-cased before matching.
        ref_dict: mapping of category name -> list of reference names.
        select_one_match: when True, keep only the single best match;
            otherwise several near-equal matches may be kept.

    Returns:
        A dict mapping category -> (match, distance); empty when *w* is
        blank or nothing matched. NOTE(review): when select_one_match is
        False, a fuzzy-matched category maps to a *list* of (match, 0.01)
        tuples rather than a single tuple — callers must handle both
        shapes; confirm downstream consumers do.
    """
    w = w.strip().upper()

    if len(w)==0:
        return {}

    else:
        closest_ref_w = dict()
        # phonetic results get extra filtering below only when no
        # exact/fuzzy hit cleared this flag
        use_phonetics = True

        for category, names in ref_dict.items():
            # check exact matching
            if w in names:
                use_phonetics = False
                closest_ref_w[category] = (w, 0)
                # exact hit: stop scanning the remaining categories
                break

            # check textual similarity (fuzzy matching)
            sim = list(map(lambda x:are_village_names_similar(w,x), names))
            similar_names = [names[i] for i in range(len(names)) if sim[i]==True]
            if similar_names:
                use_phonetics = False
                closest_ref_w[category] = (similar_names[0], 0.01) if select_one_match else list(map(lambda x:(x, 0.01), similar_names))

            # if no similar name was found check phonetical similarity
            else:
                res = get_top_n_phonetics_matches(w, names, threshold=2, top_n=1)
                if res:
                    closest_ref_w[category] = res[0] # get closest match

        # phonetic matches are the least certain: narrow them down
        if closest_ref_w and use_phonetics:
            if not select_one_match:
                # keep every category whose distance is close to the best
                closest_ref_w = get_uncertainty_range(closest_ref_w, certainty_threshold)
            else:
                # keep only the category with the smallest distance
                k, v = min(closest_ref_w.items(), key=lambda x: x[1][1])
                closest_ref_w = {k: v}

        return closest_ref_w
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
def parse_and_map_localisation(text: str, ref_dict: dict, select_one_match: bool=True):
    """
    Parse free text containing a user's localisation and return the
    closest matches per category from *ref_dict*.

    Fixes: previously an input where no token matched anything left
    ``result`` empty and ``min()`` raised ValueError; such inputs now
    return an empty dict. Docstring typo "categoty" corrected.

    Example:
        input  = COMMUNE MZODA : DOUARS : TOUKHRIBIN –TLAKEMT - COMMUNE IMINDOUNITE : DOUAR AZARZO
        output = {'commune_fr': ('IMINDOUNIT', 0.01), 'nom_fr': ('TOUKHRIBINE', 0.01)}

    Args:
        text: raw localisation text (arabic and/or latin, noisy).
        ref_dict: mapping of category name -> list of reference names.
        select_one_match: forwarded to match_word; when True each
            category keeps a single best match.

    Returns:
        Dict mapping category -> (match, distance), restricted to the
        matches whose distance is within 0.5 of the best one.
    """
    # strip words/characters that carry no location information
    # (douar/commune/province markers, punctuation, digits, ...)
    toxic = r"\bدوار|مصلى|\(|\)|douars?|communes?|cercles?|provinces?|villes?|regions?|caidate?|and|جماعة|\b|:|-|\d"
    text = re.sub(toxic, '', text.lower())

    # split on separators (pipe, space, dot, comma, slash, "et ", "و ")
    regex_pattern = r"\|| |\.|,|/|et |و "
    tokens = re.split(regex_pattern, text.replace('-', ' '))
    filtered_tokens = [s for s in tokens if s.strip() != '']

    ngrams_mapping = {}

    for n in range(1, len(filtered_tokens)+1):

        # generate ngrams of size n from the cleaned tokens
        ngrams = extract_ngrams(filtered_tokens, n)

        # best (category -> (match, distance)) found at this ngram size
        mapping_ngram = {}

        for tok in ngrams:
            res = match_word(tok, ref_dict, select_one_match=select_one_match)
            if not res:
                continue

            # category with the smallest distance for this token
            # NOTE(review): assumes each value is a (match, distance)
            # tuple — with select_one_match=False match_word may return
            # lists for fuzzy hits; confirm callers only pass True here.
            min_k, min_v = min(res.items(), key=lambda x: x[1][1])

            # keep the smaller distance when the category was already seen
            if min_k in mapping_ngram:
                _, saved_distance = mapping_ngram[min_k]
                if saved_distance > min_v[1]:
                    mapping_ngram[min_k] = min_v
            else:
                mapping_ngram[min_k] = min_v

        ngrams_mapping[n] = mapping_ngram

    # squeeze across ngram sizes so one best match remains per category
    categories = ref_dict.keys()
    result = {}
    for _, inner_dict in ngrams_mapping.items():
        for k in categories:
            if k in inner_dict:
                current_match, current_val = inner_dict[k]
                if k in result:
                    _, previous_val = result[k]
                    if current_val < previous_val:
                        result[k] = (current_match, current_val)
                else:
                    result[k] = (current_match, current_val)

    # guard: nothing matched at all — avoid min() on an empty sequence
    if not result:
        return {}

    # discard matches too far from the best one (0.5 + min distance)
    thresh = min(result.values(), key=lambda x: x[1])[1] + 0.5
    output = {k: v_d for k, v_d in result.items() if v_d[1] <= thresh}

    return output
|