fuzzy matching su tmdb per un più preciso riconoscimento
This commit is contained in:
@@ -1018,6 +1018,11 @@ class Tmdb(object):
|
||||
% (buscando, len(results), page, index_results))
|
||||
return 0
|
||||
|
||||
# Sort the results by fuzzy (trigram) match score so the most similar title comes first
|
||||
if len(results) > 1:
|
||||
from lib.fuzzy_match import algorithims
|
||||
results.sort(key=lambda r: algorithims.trigram(text_simple, r['title']), reverse=True)
|
||||
|
||||
# We return the number of results of this page
|
||||
self.results = results
|
||||
self.total_results = total_results
|
||||
|
||||
4
lib/fuzzy_match/__init__.py
Normal file
4
lib/fuzzy_match/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
# __init__.py
|
||||
|
||||
# Version of the fuzzy-match package
|
||||
__version__ = "0.0.1"
|
||||
181
lib/fuzzy_match/algorithims.py
Normal file
181
lib/fuzzy_match/algorithims.py
Normal file
@@ -0,0 +1,181 @@
|
||||
#!/usr/bin/env python
|
||||
# encoding: utf-8
|
||||
import math
|
||||
from math import floor, ceil
|
||||
import re
|
||||
from collections import Counter
|
||||
# import numpy as np
|
||||
|
||||
|
||||
|
||||
def find_ngrams(string: str, split_num: int=3) -> set:
    """
    Slice a string into character n-grams.

    The input is converted with ``str()``, lower-cased and split on runs of
    non-word characters.  Every resulting word is padded with one space on
    each side before slicing, so word boundaries produce n-grams of their
    own (``"ab"`` yields ``" ab"`` and ``"ab "``).

    Arguments:
        string: the string to find ngrams for.
        split_num: the length the ngrams should be. Defaults to 3 (trigrams).

    Returns:
        A set with the distinct n-grams of all words; an empty set when
        ``string`` is falsy.  ``None`` is returned when slicing fails
        (for example when ``split_num`` is not an integer).
    """
    try:
        if not string:
            return set()

        # Pad each word so that its first/last characters also form n-grams.
        padded_words = [f' {x} ' for x in re.split(r'\W+', str(string).lower()) if x.strip()]

        return {
            word[start:start + split_num]
            for word in padded_words
            for start in range(len(word) - split_num + 1)
        }

    # Best-effort contract is kept (None on failure), but KeyboardInterrupt
    # and SystemExit are no longer swallowed as the bare ``except:`` did.
    except Exception:
        return None
|
||||
|
||||
|
||||
def trigram(text1: str, text2: str, split_num: int=3):
    """
    Find the similarity between two strings using ngrams.

    The score is the Jaccard index of the two n-gram sets produced by
    ``find_ngrams``: shared n-grams divided by distinct n-grams overall,
    rounded to 6 decimals.

    Arguments:
        text1: main string to compare against.
        text2: second string to compare to text1.
        split_num: the length the ngrams should be. Defaults to 3 (trigrams).

    Returns:
        Float score, 0.0 being completely different strings and 1.0 being
        equal strings.  When neither string yields any n-gram (e.g. both are
        empty) 0.0 is returned.  ``None`` is returned only when the n-grams
        cannot be computed at all.
    """
    try:
        ngrams1 = find_ngrams(text1, split_num)
        ngrams2 = find_ngrams(text2, split_num)

        num_unique = len(ngrams1 | ngrams2)
        if not num_unique:
            # Nothing to compare: avoid ZeroDivisionError and return a real
            # number so the score stays usable as a sort key.
            return 0.0

        num_equal = len(ngrams1 & ngrams2)

        return round(float(num_equal) / float(num_unique), 6)

    # Keep the "None on failure" contract without trapping
    # KeyboardInterrupt/SystemExit.
    except Exception:
        return None
|
||||
|
||||
|
||||
def cosine(text1, text2):
    """
    Find the similarity between two strings using cosine vectors.

    Each string is tokenised into ``\\w+`` words (case-sensitive) and turned
    into a word-count vector; the score is the cosine of the angle between
    the two vectors.

    Arguments:
        text1: main string to compare against.
        text2: second string to compare to text1.

    Returns:
        Float score, 0.0 being completely different strings and 1.0 being
        equal strings.  0.0 is returned when either string has no words.
        ``None`` is returned when the inputs cannot be tokenised
        (e.g. they are not strings).
    """
    try:
        word_pattern = re.compile(r"\w+")
        vec1 = Counter(word_pattern.findall(text1))
        vec2 = Counter(word_pattern.findall(text2))

        numerator = sum(vec1[word] * vec2[word] for word in vec1.keys() & vec2.keys())

        sum1 = sum(count ** 2 for count in vec1.values())
        sum2 = sum(count ** 2 for count in vec2.values())
        # One sqrt over the integer product instead of sqrt(a) * sqrt(b):
        # identical texts then score exactly 1.0 rather than 0.999...8.
        denominator = math.sqrt(sum1 * sum2)

        if not denominator:
            return 0.0

        return float(numerator) / denominator

    # Keep the "None on failure" contract without trapping
    # KeyboardInterrupt/SystemExit.
    except Exception:
        return None
|
||||
|
||||
|
||||
def levenshtein(text1, text2):
    """
    Find the similarity between two strings using Levenshtein distance.

    The edit distance (insertions, deletions, substitutions) is computed
    with a two-row dynamic-programming table in pure Python and normalised
    by the length of the longer string.

    Arguments:
        text1: main string to compare against.
        text2: second string to compare to text1.

    Returns:
        Float score, 0.0 being completely different strings and 1.0 being
        equal strings (two empty strings count as equal).  ``None`` is
        returned when the inputs are not sequences.
    """
    try:
        longest = max(len(text1), len(text2))
        if not longest:
            # Both empty: identical, and avoids dividing by zero below.
            return 1.0

        # previous[y] is the distance between text1[:x-1] and text2[:y].
        previous = list(range(len(text2) + 1))

        for x, char1 in enumerate(text1, start=1):
            current = [x]
            for y, char2 in enumerate(text2, start=1):
                current.append(min(
                    previous[y] + 1,                    # deletion
                    current[y - 1] + 1,                 # insertion
                    previous[y - 1] + (char1 != char2)  # match / substitution
                ))
            previous = current

        distance = previous[-1]
        return float(longest - distance) / longest

    # Keep the "None on failure" contract without trapping
    # KeyboardInterrupt/SystemExit.
    except Exception:
        return None
|
||||
|
||||
|
||||
def jaro_winkler(s1, s2):
    """
    Find the similarity between two strings using Jaro-Winkler distance.

    The Jaro similarity is computed first (matching characters within a
    sliding window, minus half the out-of-order matches).  When it exceeds
    0.7 the Winkler adjustment is applied: a bonus of 0.1 per character of
    common prefix, up to 4 characters.

    Arguments:
        s1: main string to compare against.
        s2: second string to compare to s1.

    Returns:
        Float score in the range 0.0 - 1.0, 0.0 being completely different
        strings and 1.0 being equal strings.  ``None`` is returned when the
        inputs are not sequences.
    """
    try:
        if s1 == s2:
            return 1.0

        len1 = len(s1)
        len2 = len(s2)

        # Characters only match when they are at most this far apart.
        max_dist = max(0, max(len1, len2) // 2 - 1)

        matched1 = [False] * len1
        matched2 = [False] * len2
        match = 0

        for i in range(len1):
            window_start = max(0, i - max_dist)
            window_end = min(len2, i + max_dist + 1)
            for j in range(window_start, window_end):
                if not matched2[j] and s1[i] == s2[j]:
                    matched1[i] = True
                    matched2[j] = True
                    match += 1
                    break

        if match == 0:
            return 0.0

        # Compare the matched characters of both strings in order; every
        # pair that differs is half a transposition.
        seq1 = [char for char, used in zip(s1, matched1) if used]
        seq2 = [char for char, used in zip(s2, matched2) if used]
        transpositions = sum(a != b for a, b in zip(seq1, seq2)) // 2

        jaro = (match / len1 + match / len2 +
                (match - transpositions) / match) / 3.0

        # Winkler adjustment: reward a shared prefix of up to 4 characters.
        if jaro > 0.7:
            prefix = 0
            for a, b in zip(s1[:4], s2[:4]):
                if a != b:
                    break
                prefix += 1
            jaro += prefix * 0.1 * (1.0 - jaro)

        return float(jaro)

    # Keep the "None on failure" contract without trapping
    # KeyboardInterrupt/SystemExit.
    except Exception:
        return None
|
||||
78
lib/fuzzy_match/match.py
Normal file
78
lib/fuzzy_match/match.py
Normal file
@@ -0,0 +1,78 @@
|
||||
#!/usr/bin/env python
|
||||
# encoding: utf-8
|
||||
import heapq
|
||||
from . import algorithims
|
||||
|
||||
|
||||
|
||||
def extract(query, choices, match_type='trigram', score_cutoff=0, limit=5):
    """
    Find the similarity between a query item and a list of choices.

    Arguments:
        query: The string you are wanting to match.
        choices: An iterable or dictionary-like object containing choices
            to be matched against the query (a dictionary is iterated by
            key).
        match_type: Name of the scoring algorithm: 'trigram',
            'levenshtein', 'cosine' or 'jaro_winkler'.  A callable taking
            ``(query, choice)`` and returning a score is used as-is.
            Defaults to 'trigram'.
        score_cutoff: Optional argument for score threshold. Choices
            scoring below this number are left out. Defaults to 0.
        limit: Maximum number of results to return; ``None`` returns every
            choice that passed the cutoff. Defaults to 5.

    Returns:
        A list of ``(choice, score)`` tuples ordered from best to worst
        score.  ``None`` is returned when ``choices`` is ``None`` or empty,
        when ``match_type`` is not a known name or a callable, or when
        scoring fails unexpectedly.
    """
    try:
        if isinstance(match_type, str):
            scorers = {
                'trigram': algorithims.trigram,
                'levenshtein': algorithims.levenshtein,
                'cosine': algorithims.cosine,
                'jaro_winkler': algorithims.jaro_winkler,
            }
            scorer = scorers.get(match_type)
        else:
            scorer = match_type

        if not callable(scorer):
            return None

        try:
            if choices is None or len(choices) == 0:
                return None
        except TypeError:
            # Iterables without len() (e.g. generators) are scored as-is.
            pass

        results = []
        for choice in choices:
            score = scorer(query, choice)
            # The algorithms return None when they cannot score a pair; skip
            # that choice instead of failing the whole extraction on `>=`.
            if score is not None and score >= score_cutoff:
                results.append((choice, score))

        if limit is None:
            return sorted(results, key=lambda item: item[1], reverse=True)
        return heapq.nlargest(limit, results, key=lambda item: item[1])

    # Keep the "None on failure" contract without trapping
    # KeyboardInterrupt/SystemExit.
    except Exception:
        return None
|
||||
|
||||
|
||||
def extractOne(query, choices, match_type='trigram', score_cutoff=0):
    """
    Finds the most similar item to query item from a list of choices.

    Arguments:
        query: The string you are wanting to match.
        choices: An iterable or dictionary-like object containing choices
            to be matched against the query.
        match_type: Scoring algorithm passed through to ``extract``.
            Defaults to 'trigram'.
        score_cutoff: Optional argument for score threshold. If the best
            match is found, but it is lower than this number, then
            return None anyway ("not a good enough match"). Defaults to 0.

    Returns:
        A ``(choice, score)`` tuple for the best choice, or ``None`` when
        there are no choices, none reaches ``score_cutoff`` or the
        extraction fails.
    """
    try:
        # Only the single best candidate is needed.
        best_list = extract(query, choices, match_type, score_cutoff, limit=1)

        if not best_list:
            return None

        return best_list[0]

    # Keep the "None on failure" contract without trapping
    # KeyboardInterrupt/SystemExit.
    except Exception:
        return None
|
||||
Reference in New Issue
Block a user