This commit is contained in:
marco
2020-05-01 18:03:37 +02:00
parent 1152befcf1
commit e60525fee1
95 changed files with 306 additions and 21444 deletions
-15
View File
@@ -1,15 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Common module
"""
import re
seps = r' [](){}+*|=-_~#/\\.,;:' # list of tags/words separators
seps_no_groups = seps.replace('[](){}', '')
seps_no_fs = seps.replace('/', '').replace('\\', '')
title_seps = r'-+/\|' # separators for title
dash = (r'-', r'['+re.escape(seps_no_fs)+']') # abbreviation used by many rebulk objects.
alt_dash = (r'@', r'['+re.escape(seps_no_fs)+']') # abbreviation used by many rebulk objects.
-75
View File
@@ -1,75 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Comparators
"""
try:
from functools import cmp_to_key
except ImportError:
from ...backports import cmp_to_key
def marker_comparator_predicate(match):
"""
Match predicate used in comparator
"""
return (
not match.private
and match.name not in ('proper_count', 'title')
and not (match.name == 'container' and 'extension' in match.tags)
and not (match.name == 'other' and match.value == 'Rip')
)
def marker_weight(matches, marker, predicate):
"""
Compute the comparator weight of a marker
:param matches:
:param marker:
:param predicate:
:return:
"""
return len(set(match.name for match in matches.range(*marker.span, predicate=predicate)))
def marker_comparator(matches, markers, predicate):
"""
Builds a comparator that returns markers sorted from the most valuable to the less.
Take the parts where matches count is higher, then when length is higher, then when position is at left.
:param matches:
:type matches:
:param markers:
:param predicate:
:return:
:rtype:
"""
def comparator(marker1, marker2):
"""
The actual comparator function.
"""
matches_count = marker_weight(matches, marker2, predicate) - marker_weight(matches, marker1, predicate)
if matches_count:
return matches_count
# give preference to rightmost path
return markers.index(marker2) - markers.index(marker1)
return comparator
def marker_sorted(markers, matches, predicate=marker_comparator_predicate):
"""
Sort markers from matches, from the most valuable to the less.
:param markers:
:type markers:
:param matches:
:type matches:
:param predicate:
:return:
:rtype:
"""
return sorted(markers, key=cmp_to_key(marker_comparator(matches, markers, predicate=predicate)))
-125
View File
@@ -1,125 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Date
"""
from dateutil import parser
from rebulk.remodule import re
_dsep = r'[-/ \.]'
_dsep_bis = r'[-/ \.x]'
date_regexps = [
re.compile(r'%s((\d{8}))%s' % (_dsep, _dsep), re.IGNORECASE),
re.compile(r'%s((\d{6}))%s' % (_dsep, _dsep), re.IGNORECASE),
re.compile(r'(?:^|[^\d])((\d{2})%s(\d{1,2})%s(\d{1,2}))(?:$|[^\d])' % (_dsep, _dsep), re.IGNORECASE),
re.compile(r'(?:^|[^\d])((\d{1,2})%s(\d{1,2})%s(\d{2}))(?:$|[^\d])' % (_dsep, _dsep), re.IGNORECASE),
re.compile(r'(?:^|[^\d])((\d{4})%s(\d{1,2})%s(\d{1,2}))(?:$|[^\d])' % (_dsep_bis, _dsep), re.IGNORECASE),
re.compile(r'(?:^|[^\d])((\d{1,2})%s(\d{1,2})%s(\d{4}))(?:$|[^\d])' % (_dsep, _dsep_bis), re.IGNORECASE),
re.compile(r'(?:^|[^\d])((\d{1,2}(?:st|nd|rd|th)?%s(?:[a-z]{3,10})%s\d{4}))(?:$|[^\d])' % (_dsep, _dsep),
re.IGNORECASE)]
def valid_year(year):
"""Check if number is a valid year"""
return 1920 <= year < 2030
def _is_int(string):
"""
Check if the input string is an integer
:param string:
:type string:
:return:
:rtype:
"""
try:
int(string)
return True
except ValueError:
return False
def _guess_day_first_parameter(groups): # pylint:disable=inconsistent-return-statements
"""
If day_first is not defined, use some heuristic to fix it.
It helps to solve issues with python dateutils 2.5.3 parser changes.
:param groups: match groups found for the date
:type groups: list of match objects
:return: day_first option guessed value
:rtype: bool
"""
# If match starts with a long year, then day_first is force to false.
if _is_int(groups[0]) and valid_year(int(groups[0][:4])):
return False
# If match ends with a long year, the day_first is forced to true.
if _is_int(groups[-1]) and valid_year(int(groups[-1][-4:])):
return True
# If match starts with a short year, then day_first is force to false.
if _is_int(groups[0]) and int(groups[0][:2]) > 31:
return False
# If match ends with a short year, then day_first is force to true.
if _is_int(groups[-1]) and int(groups[-1][-2:]) > 31:
return True
def search_date(string, year_first=None, day_first=None): # pylint:disable=inconsistent-return-statements
"""Looks for date patterns, and if found return the date and group span.
Assumes there are sentinels at the beginning and end of the string that
always allow matching a non-digit delimiting the date.
Year can be defined on two digit only. It will return the nearest possible
date from today.
>>> search_date(' This happened on 2002-04-22. ')
(18, 28, datetime.date(2002, 4, 22))
>>> search_date(' And this on 17-06-1998. ')
(13, 23, datetime.date(1998, 6, 17))
>>> search_date(' no date in here ')
"""
for date_re in date_regexps:
search_match = date_re.search(string)
if not search_match:
continue
start, end = search_match.start(1), search_match.end(1)
groups = search_match.groups()[1:]
match = '-'.join(groups)
if match is None:
continue
if year_first and day_first is None:
day_first = False
if day_first is None:
day_first = _guess_day_first_parameter(groups)
# If day_first/year_first is undefined, parse is made using both possible values.
yearfirst_opts = [False, True]
if year_first is not None:
yearfirst_opts = [year_first]
dayfirst_opts = [True, False]
if day_first is not None:
dayfirst_opts = [day_first]
kwargs_list = ({'dayfirst': d, 'yearfirst': y}
for d in dayfirst_opts for y in yearfirst_opts)
for kwargs in kwargs_list:
try:
date = parser.parse(match, **kwargs)
except (ValueError, TypeError): # pragma: no cover
# see https://bugs.launchpad.net/dateutil/+bug/1247643
date = None
# check date plausibility
if date and valid_year(date.year): # pylint:disable=no-member
return start, end, date.date() # pylint:disable=no-member
-53
View File
@@ -1,53 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Expected property factory
"""
import re
from rebulk import Rebulk
from rebulk.utils import find_all
from . import dash, seps
def build_expected_function(context_key):
"""
Creates a expected property function
:param context_key:
:type context_key:
:param cleanup:
:type cleanup:
:return:
:rtype:
"""
def expected(input_string, context):
"""
Expected property functional pattern.
:param input_string:
:type input_string:
:param context:
:type context:
:return:
:rtype:
"""
ret = []
for search in context.get(context_key):
if search.startswith('re:'):
search = search[3:]
search = search.replace(' ', '-')
matches = Rebulk().regex(search, abbreviations=[dash], flags=re.IGNORECASE) \
.matches(input_string, context)
for match in matches:
ret.append(match.span)
else:
value = search
for sep in seps:
input_string = input_string.replace(sep, ' ')
search = search.replace(sep, ' ')
for start in find_all(input_string, search, ignore_case=True):
ret.append({'start': start, 'end': start + len(search), 'value': value})
return ret
return expected
-136
View File
@@ -1,136 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Formatters
"""
from rebulk.formatters import formatters
from rebulk.remodule import re
from . import seps
_excluded_clean_chars = ',:;-/\\'
clean_chars = ""
for sep in seps:
if sep not in _excluded_clean_chars:
clean_chars += sep
def _potential_before(i, input_string):
"""
Check if the character at position i can be a potential single char separator considering what's before it.
:param i:
:type i: int
:param input_string:
:type input_string: str
:return:
:rtype: bool
"""
return i - 1 >= 0 and input_string[i] in seps and input_string[i - 2] in seps and input_string[i - 1] not in seps
def _potential_after(i, input_string):
"""
Check if the character at position i can be a potential single char separator considering what's after it.
:param i:
:type i: int
:param input_string:
:type input_string: str
:return:
:rtype: bool
"""
return i + 2 >= len(input_string) or \
input_string[i + 2] == input_string[i] and input_string[i + 1] not in seps
def cleanup(input_string):
"""
Removes and strip separators from input_string (but keep ',;' characters)
It also keep separators for single characters (Mavels Agents of S.H.I.E.L.D.)
:param input_string:
:type input_string: str
:return:
:rtype:
"""
clean_string = input_string
for char in clean_chars:
clean_string = clean_string.replace(char, ' ')
# Restore input separator if they separate single characters.
# Useful for Mavels Agents of S.H.I.E.L.D.
# https://github.com/guessit-io/guessit/issues/278
indices = [i for i, letter in enumerate(clean_string) if letter in seps]
dots = set()
if indices:
clean_list = list(clean_string)
potential_indices = []
for i in indices:
if _potential_before(i, input_string) and _potential_after(i, input_string):
potential_indices.append(i)
replace_indices = []
for potential_index in potential_indices:
if potential_index - 2 in potential_indices or potential_index + 2 in potential_indices:
replace_indices.append(potential_index)
if replace_indices:
for replace_index in replace_indices:
dots.add(input_string[replace_index])
clean_list[replace_index] = input_string[replace_index]
clean_string = ''.join(clean_list)
clean_string = strip(clean_string, ''.join([c for c in seps if c not in dots]))
clean_string = re.sub(' +', ' ', clean_string)
return clean_string
def strip(input_string, chars=seps):
"""
Strip separators from input_string
:param input_string:
:param chars:
:type input_string:
:return:
:rtype:
"""
return input_string.strip(chars)
def raw_cleanup(raw):
"""
Cleanup a raw value to perform raw comparison
:param raw:
:type raw:
:return:
:rtype:
"""
return formatters(cleanup, strip)(raw.lower())
def reorder_title(title, articles=('the',), separators=(',', ', ')):
"""
Reorder the title
:param title:
:type title:
:param articles:
:type articles:
:param separators:
:type separators:
:return:
:rtype:
"""
ltitle = title.lower()
for article in articles:
for separator in separators:
suffix = separator + article
if ltitle[-len(suffix):] == suffix:
return title[-len(suffix) + len(separator):] + ' ' + title[:-len(suffix)]
return title
-165
View File
@@ -1,165 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
parse numeral from various formats
"""
from rebulk.remodule import re
digital_numeral = r'\d{1,4}'
roman_numeral = r'(?=[MCDLXVI]+)M{0,4}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})'
english_word_numeral_list = [
'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten',
'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty'
]
french_word_numeral_list = [
'zéro', 'un', 'deux', 'trois', 'quatre', 'cinq', 'six', 'sept', 'huit', 'neuf', 'dix',
'onze', 'douze', 'treize', 'quatorze', 'quinze', 'seize', 'dix-sept', 'dix-huit', 'dix-neuf', 'vingt'
]
french_alt_word_numeral_list = [
'zero', 'une', 'deux', 'trois', 'quatre', 'cinq', 'six', 'sept', 'huit', 'neuf', 'dix',
'onze', 'douze', 'treize', 'quatorze', 'quinze', 'seize', 'dixsept', 'dixhuit', 'dixneuf', 'vingt'
]
def __build_word_numeral(*args):
"""
Build word numeral regexp from list.
:param args:
:type args:
:param kwargs:
:type kwargs:
:return:
:rtype:
"""
re_ = None
for word_list in args:
for word in word_list:
if not re_:
re_ = r'(?:(?=\w+)'
else:
re_ += '|'
re_ += word
re_ += ')'
return re_
word_numeral = __build_word_numeral(english_word_numeral_list, french_word_numeral_list, french_alt_word_numeral_list)
numeral = '(?:' + digital_numeral + '|' + roman_numeral + '|' + word_numeral + ')'
__romanNumeralMap = (
('M', 1000),
('CM', 900),
('D', 500),
('CD', 400),
('C', 100),
('XC', 90),
('L', 50),
('XL', 40),
('X', 10),
('IX', 9),
('V', 5),
('IV', 4),
('I', 1)
)
__romanNumeralPattern = re.compile('^' + roman_numeral + '$')
def __parse_roman(value):
"""
convert Roman numeral to integer
:param value: Value to parse
:type value: string
:return:
:rtype:
"""
if not __romanNumeralPattern.search(value):
raise ValueError('Invalid Roman numeral: %s' % value)
result = 0
index = 0
for num, integer in __romanNumeralMap:
while value[index:index + len(num)] == num:
result += integer
index += len(num)
return result
def __parse_word(value):
"""
Convert Word numeral to integer
:param value: Value to parse
:type value: string
:return:
:rtype:
"""
for word_list in [english_word_numeral_list, french_word_numeral_list, french_alt_word_numeral_list]:
try:
return word_list.index(value.lower())
except ValueError:
pass
raise ValueError # pragma: no cover
_clean_re = re.compile(r'[^\d]*(\d+)[^\d]*')
def parse_numeral(value, int_enabled=True, roman_enabled=True, word_enabled=True, clean=True):
"""
Parse a numeric value into integer.
:param value: Value to parse. Can be an integer, roman numeral or word.
:type value: string
:param int_enabled:
:type int_enabled:
:param roman_enabled:
:type roman_enabled:
:param word_enabled:
:type word_enabled:
:param clean:
:type clean:
:return: Numeric value, or None if value can't be parsed
:rtype: int
"""
# pylint: disable=too-many-branches
if int_enabled:
try:
if clean:
match = _clean_re.match(value)
if match:
clean_value = match.group(1)
return int(clean_value)
return int(value)
except ValueError:
pass
if roman_enabled:
try:
if clean:
for word in value.split():
try:
return __parse_roman(word.upper())
except ValueError:
pass
return __parse_roman(value)
except ValueError:
pass
if word_enabled:
try:
if clean:
for word in value.split():
try:
return __parse_word(word)
except ValueError: # pragma: no cover
pass
return __parse_word(value) # pragma: no cover
except ValueError: # pragma: no cover
pass
raise ValueError('Invalid numeral: ' + value) # pragma: no cover
-27
View File
@@ -1,27 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Pattern utility functions
"""
def is_disabled(context, name):
"""Whether a specific pattern is disabled.
The context object might define an inclusion list (includes) or an exclusion list (excludes)
A pattern is considered disabled if it's found in the exclusion list or
it's not found in the inclusion list and the inclusion list is not empty or not defined.
:param context:
:param name:
:return:
"""
if not context:
return False
excludes = context.get('excludes')
if excludes and name in excludes:
return True
includes = context.get('includes')
return includes and name not in includes
-106
View File
@@ -1,106 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Quantities: Size
"""
import re
from abc import abstractmethod
import six
from ..common import seps
class Quantity(object):
"""
Represent a quantity object with magnitude and units.
"""
parser_re = re.compile(r'(?P<magnitude>\d+(?:[.]\d+)?)(?P<units>[^\d]+)?')
def __init__(self, magnitude, units):
self.magnitude = magnitude
self.units = units
@classmethod
@abstractmethod
def parse_units(cls, value):
"""
Parse a string to a proper unit notation.
"""
raise NotImplementedError
@classmethod
def fromstring(cls, string):
"""
Parse the string into a quantity object.
:param string:
:return:
"""
values = cls.parser_re.match(string).groupdict()
try:
magnitude = int(values['magnitude'])
except ValueError:
magnitude = float(values['magnitude'])
units = cls.parse_units(values['units'])
return cls(magnitude, units)
def __hash__(self):
return hash(str(self))
def __eq__(self, other):
if isinstance(other, six.string_types):
return str(self) == other
if not isinstance(other, self.__class__):
return NotImplemented
return self.magnitude == other.magnitude and self.units == other.units
def __ne__(self, other):
return not self == other
def __repr__(self):
return '<{0} [{1}]>'.format(self.__class__.__name__, self)
def __str__(self):
return '{0}{1}'.format(self.magnitude, self.units)
class Size(Quantity):
"""
Represent size.
e.g.: 1.1GB, 300MB
"""
@classmethod
def parse_units(cls, value):
return value.strip(seps).upper()
class BitRate(Quantity):
"""
Represent bit rate.
e.g.: 320Kbps, 1.5Mbps
"""
@classmethod
def parse_units(cls, value):
value = value.strip(seps).capitalize()
for token in ('bits', 'bit'):
value = value.replace(token, 'bps')
return value
class FrameRate(Quantity):
"""
Represent frame rate.
e.g.: 24fps, 60fps
"""
@classmethod
def parse_units(cls, value):
return 'fps'
-74
View File
@@ -1,74 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Validators
"""
from functools import partial
from rebulk.validators import chars_before, chars_after, chars_surround
from . import seps
seps_before = partial(chars_before, seps)
seps_after = partial(chars_after, seps)
seps_surround = partial(chars_surround, seps)
def int_coercable(string):
"""
Check if string can be coerced to int
:param string:
:type string:
:return:
:rtype:
"""
try:
int(string)
return True
except ValueError:
return False
def and_(*validators):
"""
Compose validators functions
:param validators:
:type validators:
:return:
:rtype:
"""
def composed(string):
"""
Composed validators function
:param string:
:type string:
:return:
:rtype:
"""
for validator in validators:
if not validator(string):
return False
return True
return composed
def or_(*validators):
"""
Compose validators functions
:param validators:
:type validators:
:return:
:rtype:
"""
def composed(string):
"""
Composed validators function
:param string:
:type string:
:return:
:rtype:
"""
for validator in validators:
if validator(string):
return True
return False
return composed
-34
View File
@@ -1,34 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Words utils
"""
from collections import namedtuple
from . import seps
_Word = namedtuple('_Word', ['span', 'value'])
def iter_words(string):
"""
Iterate on all words in a string
:param string:
:type string:
:return:
:rtype: iterable[str]
"""
i = 0
last_sep_index = -1
inside_word = False
for char in string:
if ord(char) < 128 and char in seps: # Make sure we don't exclude unicode characters.
if inside_word:
yield _Word(span=(last_sep_index+1, i), value=string[last_sep_index+1:i])
inside_word = False
last_sep_index = i
else:
inside_word = True
i += 1
if inside_word:
yield _Word(span=(last_sep_index+1, i), value=string[last_sep_index+1:i])