fuzzy matching su tmdb per un più preciso riconoscimento

2021-02-07 11:54:15 +01:00
parent 59ba64a5e0
commit a180136f8c
4 changed files with 268 additions and 0 deletions
--- a/core/tmdb.py
+++ b/core/tmdb.py
@@ -1018,6 +1018,11 @@ class Tmdb(object):
                    % (buscando, len(results), page, index_results))
                return 0

+            # We sort result based on fuzzy match to detect most similar
+            if len(results) > 1:
+                from lib.fuzzy_match import algorithims
+                results.sort(key=lambda r: algorithims.trigram(text_simple, r['title']), reverse=True)
+
            # We return the number of results of this page
            self.results = results
            self.total_results = total_results
--- a/lib/fuzzy_match/init.py
+++ b/lib/fuzzy_match/init.py
@@ -0,0 +1,4 @@
+# __init__.py
+
+# Version of the fuzzy-match package
+__version__ = "0.0.1"
--- a/lib/fuzzy_match/algorithims.py
+++ b/lib/fuzzy_match/algorithims.py
@@ -0,0 +1,181 @@
+#!/usr/bin/env python
+# encoding: utf-8
+import math
+from math import floor, ceil 
+import re
+from collections import Counter
+# import numpy as np
+
+
+
+def find_ngrams(string: str, split_num: int=3) -> set:
+    """
+    Slice string into ngrams.
+    Returns array of ngrams for the given string.
+
+    Arguments:
+        text: the string to find ngrams for.
+        split_num: the length the ngrams should be. Defaults to 3 (trigrams).
+    """
+    try:
+        if not string:
+            return set()
+
+        words = [f'  {x} ' for x in re.split(r'\W+', str(string).lower()) if x.strip()]
+
+        ngrams = set()
+
+        for word in words:
+            for x in range(0, len(word) - split_num + 1):
+                ngrams.add(word[x:x+split_num])
+
+        return ngrams
+
+    except:
+        return None
+
+
+def trigram(text1: str, text2: str, split_num: int=3):
+    """
+    Find the similarity between two strings using ngrams.
+    Returns float score value, 0.0 being completely different strings and 1.0 being equal strings.
+
+    Arguments:
+        text1: main string to compare against.
+        text2: second string to compare to text1.
+        split_num: the length the ngrams should be. Defaults to 3 (trigrams).
+    """
+    try:
+        ngrams1 = find_ngrams(text1, split_num)
+        ngrams2 = find_ngrams(text2, split_num)
+
+        num_unique = len(ngrams1 | ngrams2)
+        num_equal = len(ngrams1 & ngrams2)
+
+        score = round(float(num_equal) / float(num_unique), 6)
+
+        return score
+
+    except:
+        return None
+
+
+def cosine(text1, text2):
+    """
+    Find the similarity between two strings using cosine vectors.
+    Returns float score value, 0.0 being completely different strings and 1.0 being equal strings.
+
+    Arguments:
+        text1: main string to compare against.
+        text2: second string to compare to text1.
+    """
+    try:
+        vec1 = Counter(re.compile(r"\w+").findall(text1))
+        vec2 = Counter(re.compile(r"\w+").findall(text2))
+        intersection = set(vec1.keys()) & set(vec2.keys())
+        numerator = sum([vec1[x] * vec2[x] for x in intersection])
+
+        sum1 = sum([vec1[x] ** 2 for x in list(vec1.keys())])
+        sum2 = sum([vec2[x] ** 2 for x in list(vec2.keys())])
+        denominator = math.sqrt(sum1) * math.sqrt(sum2)
+
+        if not denominator:
+            return 0.0
+        else:
+            return float(numerator) / denominator
+
+    except:
+        return None
+
+
+def levenshtein(text1, text2):
+    """
+    Find the similarity between two strings using Levenshtein distance.
+    Returns float score value, 0.0 being completely different strings and 1.0 being equal strings.
+
+    Arguments:
+        text1: main string to compare against.
+        text2: second string to compare to text1.
+    """
+    try:
+        size_x = len(text1) + 1
+        size_y = len(text2) + 1
+        matrix = np.zeros ((size_x, size_y))
+        for x in range(size_x):
+            matrix [x, 0] = x
+        for y in range(size_y):
+            matrix [0, y] = y
+
+        for x in range(1, size_x):
+            for y in range(1, size_y):
+                if text1[x-1] == text2[y-1]:
+                    matrix [x,y] = min(
+                        matrix[x-1, y] + 1,
+                        matrix[x-1, y-1],
+                        matrix[x, y-1] + 1
+                    )
+                else:
+                    matrix [x,y] = min(
+                        matrix[x-1,y] + 1,
+                        matrix[x-1,y-1] + 1,
+                        matrix[x,y-1] + 1
+                    )
+        distance = matrix[size_x - 1, size_y - 1]
+        score = (max(len(text1), len(text2)) - distance) / max(len(text1), len(text2))
+        return float(score)
+
+    except:
+        return None
+
+
+def jaro_winkler(s1, s2):
+    """
+    Find the similarity between two strings using Jaro-Winkler distance.
+    Returns float score value, 0.0 being completely different strings and 1.0 being equal strings.
+
+    Arguments:
+        text1: main string to compare against.
+        text2: second string to compare to text1.
+    """
+    try:
+        if (s1 == s2): 
+            return 1.0
+    
+        len1 = len(s1) 
+        len2 = len(s2) 
+        max_dist = floor(max(len1, len2) / 2) - 1
+        match = 0
+        hash_s1 = [0] * len(s1) 
+        hash_s2 = [0] * len(s2) 
+
+        for i in range(len1): 
+            for j in range(max(0, i - max_dist),  
+                        min(len2, i + max_dist + 1)): 
+                
+                if (s1[i] == s2[j] and hash_s2[j] == 0): 
+                    hash_s1[i] = 1
+                    hash_s2[j] = 1
+                    match += 1
+                    break
+
+        if (match == 0): 
+            return 0.0
+    
+        t = 0
+        point = 0
+
+        for i in range(len1): 
+            if (hash_s1[i]): 
+     
+                while (hash_s2[point] == 0): 
+                    point += 1
+    
+                if (s1[i] != s2[point]): 
+                    point += 1
+                    t += 1
+        t = t//2
+    
+        return float(match/ len1 + match / len2 + 
+                (match - t + 1) / match)/ 3.0
+    except:
+        return None
--- a/lib/fuzzy_match/match.py
+++ b/lib/fuzzy_match/match.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python
+# encoding: utf-8
+import heapq
+from . import algorithims
+
+
+
+def extract(query, choices, match_type='trigram', score_cutoff=0, limit=5):
+    """
+    Find the similarity between a query item and a list of choices.
+    Returns a tuple of all choices and their associated similarity score.
+
+     Arguments:
+        query: The string you are wanting to match.
+        choices: An iterable or dictionary-like object containing choices
+            to be matched against the query.
+        score_cutoff: Optional argument for score threshold. If the best
+            match is found, but it is not greater than this number, then
+            return None anyway ("not a good enough match").  Defaults to 0.
+
+    """
+    try:
+        if match_type == 'trigram':
+            match_type = algorithims.trigram
+        elif match_type == 'levenshtein':
+            match_type = algorithims.levenshtein
+        elif match_type == 'cosine':
+            match_type = algorithims.cosine
+        elif match_type == 'jaro_winkler':
+            match_type = algorithims.jaro_winkler
+        try:
+            if choices is None or len(choices) == 0:
+                return
+        except TypeError:
+            pass
+
+        results = []
+        
+        for i in choices:
+            score = (match_type(query, i))
+            data = (i, score)
+            if score >= score_cutoff:
+                results.append(data)
+
+
+        return heapq.nlargest(limit, results, key=lambda i: i[1]) if limit is not None else \
+            sorted(results, key=lambda i: i[1], reverse=True)
+
+        # return results
+
+    except:
+        return None
+
+
+def extractOne(query, choices, match_type='trigram', score_cutoff=0):
+    """
+    Finds the most similar item to query item from a list of choices.
+    Returns tuple of best choice and its associated similarity score.
+
+     Arguments:
+        query: The string you are wanting to match.
+        choices: An iterable or dictionary-like object containing choices
+            to be matched against the query.
+        score_cutoff: Optional argument for score threshold. If the best
+            match is found, but it is not greater than this number, then
+            return None anyway ("not a good enough match"). Defaults to 0.
+
+    """
+
+    try:
+        best_list = extract(query, choices, match_type, score_cutoff)
+
+        best = max(best_list, key=lambda i: i[1])
+
+        return best
+
+    except:
+        return None