Files
addon/lib/fuzzy_match/algorithims.py
mac12m99 748fad7431 KoD 1.6
- rimosso supporto a TVDB (l'accesso alle API diventerà a pagamento)
- aggiunto canale Discovery+
- aggiunta possibilità di scegliere numerazioni alternative per le serie tv
- migliorie interne di vario tipo (tra cui un migliore riconoscimento dei contenuti nel caso siano scritti male)
2021-02-13 16:37:02 +01:00

181 lines
5.0 KiB
Python

#!/usr/bin/env python
# encoding: utf-8
import math
from math import floor, ceil
import re
from collections import Counter
# import numpy as np
def find_ngrams(string, split_num=3):
    """
    Slice string into ngrams.
    Returns a set of the unique ngrams for the given string, an empty set
    when the string is empty/None, or None when the input cannot be processed
    (for example a non-integer split_num).
    Arguments:
        string: the value to find ngrams for. It is converted with str(),
            lower-cased and split on runs of non-word characters.
        split_num: the length the ngrams should be. Defaults to 3 (trigrams).
    """
    try:
        if not string:
            return set()
        # Pad every word with one space on each side so that the start and
        # the end of a word produce their own ngrams.
        words = [
            ' {} '.format(token)
            for token in re.split(r'\W+', str(string).lower())
            if token
        ]
        return {
            word[start:start + split_num]
            for word in words
            for start in range(len(word) - split_num + 1)
        }
    except Exception:
        # Best-effort contract kept from the original: bad input yields None.
        # Unlike a bare ``except:``, KeyboardInterrupt/SystemExit propagate.
        return None
def trigram(text1, text2, split_num=3):
    """
    Find the similarity between two strings using ngrams.
    Returns float score value (rounded to 6 decimals), 0.0 being completely
    different strings and 1.0 being equal strings. The score is the number of
    shared ngrams divided by the number of distinct ngrams of both strings.
    Returns 0.0 when neither string yields any ngram, and None when the
    ngrams cannot be computed.
    Arguments:
        text1: main string to compare against.
        text2: second string to compare to text1.
        split_num: the length the ngrams should be. Defaults to 3 (trigrams).
    """
    try:
        ngrams1 = find_ngrams(text1, split_num)
        ngrams2 = find_ngrams(text2, split_num)
        num_unique = len(ngrams1 | ngrams2)
        if not num_unique:
            # Nothing to compare (both inputs empty or without word
            # characters): return a float instead of dividing by zero.
            return 0.0
        num_equal = len(ngrams1 & ngrams2)
        return round(float(num_equal) / num_unique, 6)
    except Exception:
        # find_ngrams() returned None (unusable input), so the set operations
        # failed; keep the original best-effort behaviour of returning None.
        return None
def cosine(text1, text2):
    """
    Find the similarity between two strings using cosine vectors.
    Each string is turned into a vector of word counts (words are runs of
    word characters; the comparison is case-sensitive).
    Returns float score value, 0.0 being completely different strings and 1.0 being equal strings.
    Returns 0.0 when either string contains no words, and None when an input
    is not a string.
    Arguments:
        text1: main string to compare against.
        text2: second string to compare to text1.
    """
    try:
        vec1 = Counter(re.findall(r"\w+", text1))
        vec2 = Counter(re.findall(r"\w+", text2))
        # Dot product over the words both strings share.
        numerator = sum(vec1[word] * vec2[word] for word in vec1 if word in vec2)
        sum1 = sum(count ** 2 for count in vec1.values())
        sum2 = sum(count ** 2 for count in vec2.values())
        # Take a single square root of the integer product: multiplying two
        # separately rounded roots made identical strings score
        # 0.9999999999999998 or 1.0000000000000002 instead of exactly 1.0.
        denominator = math.sqrt(sum1 * sum2)
        if not denominator:
            return 0.0
        return float(numerator) / denominator
    except Exception:
        # Best-effort contract kept from the original: bad input yields None.
        # Unlike a bare ``except:``, KeyboardInterrupt/SystemExit propagate.
        return None
def levenshtein(text1, text2):
    """
    Find the similarity between two strings using Levenshtein distance.
    Returns float score value, 0.0 being completely different strings and 1.0 being equal strings.
    The score is (longest length - edit distance) / longest length. Two empty
    strings score 1.0. Returns None when an input has no length (e.g. None).
    Arguments:
        text1: main string to compare against.
        text2: second string to compare to text1.
    """
    try:
        len1 = len(text1)
        len2 = len(text2)
        longest = max(len1, len2)
        if not longest:
            # Both inputs are empty: distance 0, and nothing to divide by.
            return 1.0
        # Pure-Python dynamic programming. The previous implementation used
        # numpy ("np"), which this module does not import, so it could never
        # produce a score. Only two rows of the distance matrix are needed.
        previous = list(range(len2 + 1))
        for row, char1 in enumerate(text1, 1):
            current = [row]
            for col, char2 in enumerate(text2, 1):
                current.append(min(
                    previous[col] + 1,                      # deletion
                    current[col - 1] + 1,                   # insertion
                    previous[col - 1] + (char1 != char2),   # substitution / match
                ))
            previous = current
        distance = previous[len2]
        # Explicit float conversion so the division is never an integer one.
        return float(longest - distance) / longest
    except Exception:
        # Best-effort contract kept from the original: bad input yields None.
        # Unlike a bare ``except:``, KeyboardInterrupt/SystemExit propagate.
        return None
def jaro_winkler(s1, s2):
    """
    Find the similarity between two strings using Jaro-Winkler distance.
    Returns float score value, 0.0 being completely different strings and 1.0 being equal strings.
    The Jaro similarity is computed first; when it is above 0.7 it is boosted
    for a common prefix of up to 4 characters (scaling factor 0.1).
    Returns None when an input has no length (e.g. None compared to a string).
    Arguments:
        s1: main string to compare against.
        s2: second string to compare to s1.
    """
    try:
        if s1 == s2:
            return 1.0
        len1 = len(s1)
        len2 = len(s2)
        # Two characters only count as matching when their positions are at
        # most this far apart. Integer arithmetic keeps it usable in range().
        max_dist = max(0, max(len1, len2) // 2 - 1)
        matched1 = [False] * len1
        matched2 = [False] * len2
        match = 0
        for i in range(len1):
            window_start = max(0, i - max_dist)
            window_end = min(len2, i + max_dist + 1)
            for j in range(window_start, window_end):
                if not matched2[j] and s1[i] == s2[j]:
                    matched1[i] = matched2[j] = True
                    match += 1
                    break
        if not match:
            return 0.0
        # Compare the matched characters of both strings in order; every pair
        # that differs is half a transposition.
        seq1 = [char for char, hit in zip(s1, matched1) if hit]
        seq2 = [char for char, hit in zip(s2, matched2) if hit]
        transpositions = sum(a != b for a, b in zip(seq1, seq2)) // 2
        matches = float(match)
        jaro = (matches / len1 + matches / len2
                + (matches - transpositions) / matches) / 3.0
        # Winkler adjustment: only applied to already similar strings.
        if jaro <= 0.7:
            return jaro
        prefix = 0
        for a, b in zip(s1[:4], s2[:4]):
            if a != b:
                break
            prefix += 1
        return jaro + prefix * 0.1 * (1.0 - jaro)
    except Exception:
        # Best-effort contract kept from the original: bad input yields None.
        # Unlike a bare ``except:``, KeyboardInterrupt/SystemExit propagate.
        return None