fuzzy matching su tmdb per un più preciso riconoscimento

2021-02-07 11:54:15 +01:00
parent 59ba64a5e0
commit a180136f8c
4 changed files with 268 additions and 0 deletions
@@ -0,0 +1,78 @@
+#!/usr/bin/env python
+# encoding: utf-8
+import heapq
+from . import algorithims
+
+
+
def extract(query, choices, match_type='trigram', score_cutoff=0, limit=5):
    """
    Score every choice against a query and return the best matches.

     Arguments:
        query: The string you are wanting to match.
        choices: An iterable of candidates to be matched against the
            query. A dictionary is iterated directly, so its keys are
            the items that get scored.
        match_type: Name of the scoring function to use: 'trigram',
            'levenshtein', 'cosine' or 'jaro_winkler' (looked up on the
            ``algorithims`` module). Any other value is used as-is, so a
            callable ``scorer(query, choice)`` can be passed directly.
        score_cutoff: Optional score threshold (inclusive). Choices
            scoring below this number are dropped. Defaults to 0.
        limit: Maximum number of results to return. ``None`` returns
            every choice that passed the cutoff. Defaults to 5.

     Returns:
        A list of ``(choice, score)`` tuples ordered from highest to
        lowest score, or ``None`` when ``choices`` is ``None`` or empty,
        or when scoring fails (unknown scorer name, scorer error, ...).

    """
    known_scorers = ('trigram', 'levenshtein', 'cosine', 'jaro_winkler')

    try:
        if choices is None:
            return None
        try:
            if len(choices) == 0:
                return None
        except TypeError:
            # Generators and other unsized iterables have no len();
            # they are simply consumed by the loop below.
            pass

        if isinstance(match_type, str) and match_type in known_scorers:
            scorer = getattr(algorithims, match_type)
        else:
            # Anything else is called directly; a value that is not
            # callable fails on first use and yields None below.
            scorer = match_type

        scored = []
        for choice in choices:
            score = scorer(query, choice)
            if score >= score_cutoff:
                scored.append((choice, score))

        if limit is None:
            return sorted(scored, key=lambda pair: pair[1], reverse=True)
        return heapq.nlargest(limit, scored, key=lambda pair: pair[1])

    except Exception:
        # Best-effort contract: any ordinary failure means "no match".
        # KeyboardInterrupt / SystemExit are deliberately not swallowed.
        return None
+
+
def extractOne(query, choices, match_type='trigram', score_cutoff=0):
    """
    Finds the most similar item to query item from a list of choices.

     Arguments:
        query: The string you are wanting to match.
        choices: An iterable of candidates to be matched against the
            query; passed unchanged to ``extract``.
        match_type: Scorer selection, passed unchanged to ``extract``.
            Defaults to 'trigram'.
        score_cutoff: Optional score threshold (inclusive). If the best
            match scores below this number, ``None`` is returned
            ("not a good enough match"). Defaults to 0.

     Returns:
        A ``(choice, score)`` tuple for the highest-scoring choice, or
        ``None`` when ``extract`` returns ``None`` or an empty list.

    """
    # Only the single best candidate is needed, so ask for exactly one
    # result instead of building a longer list and re-scanning it.
    best_list = extract(query, choices, match_type, score_cutoff, limit=1)

    # extract() yields None or an empty list when nothing qualifies.
    if not best_list:
        return None

    return best_list[0]