From a180136f8ccc9fc0fe373c7c3774aa18ade4688c Mon Sep 17 00:00:00 2001
From: mac12m99 <10120390+mac12m99@users.noreply.github.com>
Date: Sun, 7 Feb 2021 11:54:15 +0100
Subject: [PATCH] =?UTF-8?q?fuzzy=20matching=20su=20tmdb=20per=20un=20pi?=
 =?UTF-8?q?=C3=B9=20preciso=20riconoscimento?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 core/tmdb.py                   |   6 +
 lib/fuzzy_match/__init__.py    |   4 +
 lib/fuzzy_match/algorithims.py | 150 +++++++++++++++++++++++++++++++++
 lib/fuzzy_match/match.py       |  61 ++++++++++++++
 4 files changed, 221 insertions(+)
 create mode 100644 lib/fuzzy_match/__init__.py
 create mode 100644 lib/fuzzy_match/algorithims.py
 create mode 100644 lib/fuzzy_match/match.py

diff --git a/core/tmdb.py b/core/tmdb.py
index 95388910..d79e756e 100644
--- a/core/tmdb.py
+++ b/core/tmdb.py
@@ -1018,6 +1018,12 @@ class Tmdb(object):
                           % (buscando, len(results), page, index_results))
             return 0
 
+        # We sort result based on fuzzy match to detect most similar
+        if len(results) > 1:
+            from lib.fuzzy_match import algorithims
+            # "or 0.0" guards against a None score breaking the sort in py3
+            results.sort(key=lambda r: algorithims.trigram(text_simple, r['title']) or 0.0, reverse=True)
+
         # We return the number of results of this page
         self.results = results
         self.total_results = total_results
diff --git a/lib/fuzzy_match/__init__.py b/lib/fuzzy_match/__init__.py
new file mode 100644
index 00000000..84e0db87
--- /dev/null
+++ b/lib/fuzzy_match/__init__.py
@@ -0,0 +1,4 @@
+# __init__.py
+
+# Version of the fuzzy-match package
+__version__ = "0.0.1"
diff --git a/lib/fuzzy_match/algorithims.py b/lib/fuzzy_match/algorithims.py
new file mode 100644
index 00000000..d0e7aa2e
--- /dev/null
+++ b/lib/fuzzy_match/algorithims.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python
+# encoding: utf-8
+"""Fuzzy string similarity algorithms.
+
+Every scorer returns a float in [0.0, 1.0]: 0.0 for completely different
+strings, 1.0 for equal strings.
+"""
+import math
+import re
+from collections import Counter
+
+# Pre-compiled patterns shared by the tokenizers below.
+_SPLIT_RE = re.compile(r'\W+')
+_WORD_RE = re.compile(r'\w+')
+
+
+def find_ngrams(string: str, split_num: int = 3) -> set:
+    """
+    Slice string into ngrams.
+    Returns a set of ngrams for the given string (empty set for falsy input).
+
+    Arguments:
+        string: the string to find ngrams for.
+        split_num: the length the ngrams should be. Defaults to 3 (trigrams).
+    """
+    if not string:
+        return set()
+
+    # Pad each word with spaces so word boundaries form their own ngrams.
+    words = [' %s ' % x for x in _SPLIT_RE.split(str(string).lower()) if x.strip()]
+
+    ngrams = set()
+    for word in words:
+        for x in range(0, len(word) - split_num + 1):
+            ngrams.add(word[x:x + split_num])
+    return ngrams
+
+
+def trigram(text1: str, text2: str, split_num: int = 3) -> float:
+    """
+    Find the similarity between two strings using ngrams (Jaccard index).
+
+    Arguments:
+        text1: main string to compare against.
+        text2: second string to compare to text1.
+        split_num: the length the ngrams should be. Defaults to 3 (trigrams).
+    """
+    ngrams1 = find_ngrams(text1, split_num)
+    ngrams2 = find_ngrams(text2, split_num)
+
+    num_unique = len(ngrams1 | ngrams2)
+    if not num_unique:
+        # Neither string produced any ngram (empty / punctuation only);
+        # returning 0.0 instead of dividing by zero keeps callers sortable.
+        return 0.0
+    return round(float(len(ngrams1 & ngrams2)) / num_unique, 6)
+
+
+def cosine(text1, text2):
+    """
+    Find the similarity between two strings using cosine distance of
+    word-count vectors.
+    """
+    vec1 = Counter(_WORD_RE.findall(text1))
+    vec2 = Counter(_WORD_RE.findall(text2))
+
+    intersection = set(vec1) & set(vec2)
+    numerator = sum(vec1[x] * vec2[x] for x in intersection)
+
+    denominator = (math.sqrt(sum(c * c for c in vec1.values()))
+                   * math.sqrt(sum(c * c for c in vec2.values())))
+    if not denominator:
+        return 0.0
+    return float(numerator) / denominator
+
+
+def levenshtein(text1, text2):
+    """
+    Find the similarity between two strings using Levenshtein distance,
+    normalized by the length of the longer string.
+    """
+    if text1 == text2:
+        return 1.0
+    if not text1 or not text2:
+        return 0.0
+
+    # Two-row dynamic programming. (The previous version called np.zeros
+    # with numpy never imported; the bare except hid the NameError and the
+    # function always returned None.)
+    previous = list(range(len(text2) + 1))
+    for i, char1 in enumerate(text1, 1):
+        current = [i]
+        for j, char2 in enumerate(text2, 1):
+            cost = 0 if char1 == char2 else 1
+            current.append(min(previous[j] + 1,
+                               current[j - 1] + 1,
+                               previous[j - 1] + cost))
+        previous = current
+
+    longest = max(len(text1), len(text2))
+    return float(longest - previous[-1]) / longest
+
+
+def jaro_winkler(s1, s2):
+    """
+    Find the similarity between two strings using the Jaro-Winkler
+    distance (Jaro similarity plus a bonus for a shared prefix).
+    """
+    if s1 == s2:
+        return 1.0
+    if not s1 or not s2:
+        return 0.0
+
+    len1 = len(s1)
+    len2 = len(s2)
+    max_dist = max(len1, len2) // 2 - 1
+
+    hash_s1 = [0] * len1
+    hash_s2 = [0] * len2
+    match = 0
+    for i in range(len1):
+        for j in range(max(0, i - max_dist), min(len2, i + max_dist + 1)):
+            if s1[i] == s2[j] and hash_s2[j] == 0:
+                hash_s1[i] = 1
+                hash_s2[j] = 1
+                match += 1
+                break
+    if match == 0:
+        return 0.0
+
+    # Count transpositions among the matched characters.
+    t = 0
+    point = 0
+    for i in range(len1):
+        if hash_s1[i]:
+            while hash_s2[point] == 0:
+                point += 1
+            if s1[i] != s2[point]:
+                t += 1
+            point += 1  # always advance past the consumed match
+    t //= 2
+
+    jaro = (float(match) / len1 + float(match) / len2
+            + float(match - t) / match) / 3.0
+
+    # Winkler bonus: reward up to 4 leading characters in common.
+    prefix = 0
+    for a, b in zip(s1, s2):
+        if a != b or prefix == 4:
+            break
+        prefix += 1
+    return jaro + prefix * 0.1 * (1.0 - jaro)
diff --git a/lib/fuzzy_match/match.py b/lib/fuzzy_match/match.py
new file mode 100644
index 00000000..c01480ae
--- /dev/null
+++ b/lib/fuzzy_match/match.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+# encoding: utf-8
+import heapq
+from . import algorithims
+
+# Dispatch table mapping algorithm names to scoring functions.
+_ALGORITHMS = {
+    'trigram': algorithims.trigram,
+    'levenshtein': algorithims.levenshtein,
+    'cosine': algorithims.cosine,
+    'jaro_winkler': algorithims.jaro_winkler,
+}
+
+
+def extract(query, choices, match_type='trigram', score_cutoff=0, limit=5):
+    """
+    Find the similarity between a query item and a list of choices.
+    Returns a list of (choice, score) tuples sorted by descending score.
+
+    Arguments:
+        query: The string you are wanting to match.
+        choices: An iterable or dictionary-like object containing choices
+            to be matched against the query.
+        match_type: Name of the scoring algorithm (see _ALGORITHMS) or a
+            callable taking (query, choice) and returning a float.
+        score_cutoff: Optional score threshold; results scoring below it
+            are dropped. Defaults to 0.
+        limit: Maximum number of results to return; None for all.
+    """
+    scorer = _ALGORITHMS.get(match_type, match_type)
+    if not callable(scorer):
+        # Previously an unknown name raised TypeError that a bare except
+        # silently turned into None; fail loudly instead.
+        raise ValueError('unknown match_type: %r' % (match_type,))
+
+    try:
+        if choices is None or len(choices) == 0:
+            return None
+    except TypeError:
+        pass  # generators have no len(); just iterate them below
+
+    results = []
+    for choice in choices:
+        score = scorer(query, choice)
+        # A None score means the scorer failed on this pair; skip it so
+        # the ranking below never compares None with floats.
+        if score is not None and score >= score_cutoff:
+            results.append((choice, score))
+
+    if limit is not None:
+        return heapq.nlargest(limit, results, key=lambda pair: pair[1])
+    return sorted(results, key=lambda pair: pair[1], reverse=True)
+
+
+def extractOne(query, choices, match_type='trigram', score_cutoff=0):
+    """
+    Find the most similar item to the query from a list of choices.
+    Returns a (best_choice, score) tuple, or None when there is no match.
+    """
+    best_list = extract(query, choices, match_type, score_cutoff)
+    if not best_list:
+        return None
+    return max(best_list, key=lambda pair: pair[1])