parse torrent name

2020-04-27 20:31:31 +02:00
parent 288b900350
commit e9f6987324
5 changed files with 226 additions and 15 deletions
@@ -40,9 +40,11 @@ def mainlist(item):
@support.scrape
 def peliculas(item):
    ptn = True
    patron = r'>(?P<quality>[^"<]+)</td> <TD[^>]+><A class="tab" HREF="(?P<url>[^"]+)"\s*>(?P<title>[^<]+)<[^>]+>[^>]+>[^>]+>[^>]+>[^>]+>(?P<size>[^<]+)<[^>]+>[^>]+>[^>]+>[^>]+>[^>]+>[^>]+>[^>]+>[^>]+>[^>]+>[^>]+>[^>]+>[^>]+>[^>]+>[^>]+>[^>]+>[^>]+>(?P<seed>[^<]+)'
    def itemHook(item):
-        item.title = item.title.replace('.',' ')
+        # item.title = item.title.replace('.',' ')
        item.contentType = item.args[1]
        thumb = (item.args[1] if type(item.args) == list else item.args) + '.png'
        item.thumbnail = support.thumb(thumb=thumb)
        return item
@@ -6,6 +6,9 @@ import inspect
 import os
 import re
 import sys
 from lib.PTN import PTN
 PY3 = False
 if sys.version_info[0] >= 3: PY3 = True; unicode = str; unichr = chr; long = int
 if PY3:
@@ -171,7 +174,7 @@ def cleantitle(title):
    cleantitle = title.replace('"', "'").replace('×', 'x').replace('–', '-').strip()
    return cleantitle
-def scrapeBlock(item, args, block, patron, headers, action, pagination, debug, typeContentDict, typeActionDict, blacklist, search, pag, function, lang):
+def scrapeBlock(item, args, block, patron, headers, action, pagination, debug, typeContentDict, typeActionDict, blacklist, search, pag, function, lang, ptn):
    itemlist = []
    log("scrapeBlock qui")
    if debug:
@@ -240,17 +243,6 @@ def scrapeBlock(item, args, block, patron, headers, action, pagination, debug, t
        Type = scraped['type'] if scraped['type'] else ''
        plot = cleantitle(scraped["plot"]) if scraped["plot"] else ''
        # make formatted Title [longtitle]
        s = ' - '
        title = episode + (s if episode and title else '') + title
        longtitle = title + (s if title and title2 else '') + title2
        longtitle = typo(longtitle, 'bold')
        longtitle += typo(quality, '_ [] color kod') if quality else ''
        longtitle += typo(scraped['size'], '_ [] color kod') if scraped['size'] else ''
        longtitle += typo(scraped['seed']+ ' SEEDS', '_ [] color kod') if scraped['seed'] else ''
        lang1, longtitle = scrapeLang(scraped, lang, longtitle)
        # if title is set, probably this is a list of episodes or video sources
        # necessaria l'aggiunta di == scraped["title"] altrimenti non prende i gruppi dopo le categorie
        if item.infoLabels["title"] == scraped["title"]:
@@ -275,6 +267,28 @@ def scrapeBlock(item, args, block, patron, headers, action, pagination, debug, t
            if scraped["rating"]:
                infolabels['rating'] = scrapertools.decodeHtmlentities(scraped["rating"])
        # make formatted Title [longtitle]
        s = ' - '
        title = episode + (s if episode and title else '') + title
        longtitle = title + (s if title and title2 else '') + title2
        lang1, longtitle = scrapeLang(scraped, lang, longtitle)
        if ptn:
            titlePTN = PTN().parse(title)
            title = titlePTN.get('title', '')
            if titlePTN.get('quality', '') or titlePTN.get('resolution', ''):
                quality = titlePTN.get('quality', '') + " " + titlePTN.get('resolution', '')
            if not scraped['year']:
                infolabels['year'] = titlePTN.get('year', '')
            if titlePTN.get('episode', None) and titlePTN.get('season', None):
                longtitle = title + ' - ' + str(titlePTN.get('episode')) + 'x' + str(titlePTN.get('season'))
        longtitle = typo(longtitle, 'bold')
        longtitle += typo(quality, '_ [] color kod') if quality else ''
        longtitle += typo(scraped['size'], '_ [] color kod') if scraped['size'] else ''
        longtitle += typo(scraped['seed'] + ' SEEDS', '_ [] color kod') if scraped['seed'] else ''
        AC = CT = ''
        if typeContentDict:
            for name, variants in typeContentDict.items():
@@ -380,6 +394,7 @@ def scrape(func):
        if 'pagination' in args and inspect.stack()[1][3] not in ['add_tvshow', 'get_episodes', 'update', 'find_episodes']: pagination = args['pagination'] if args['pagination'] else 20
        else: pagination = ''
        lang = args['deflang'] if 'deflang' in args else ''
        ptn = args.get('ptn', False)
        pag = item.page if item.page else 1  # pagination
        matches = []
@@ -402,7 +417,7 @@ def scrape(func):
                    if 'season' in bl and bl['season']:
                        item.season = bl['season']
                    blockItemlist, blockMatches = scrapeBlock(item, args, bl['block'], patron, headers, action, pagination, debug,
-                                                typeContentDict, typeActionDict, blacklist, search, pag, function, lang)
+                                                typeContentDict, typeActionDict, blacklist, search, pag, function, lang, ptn)
                    for it in blockItemlist:
                        if 'lang' in bl:
                            it.contentLanguage, it.title = scrapeLang(bl, it.contentLanguage, it.title)
@@ -413,7 +428,7 @@ def scrape(func):
                    matches.extend(blockMatches)
            elif patron:
                itemlist, matches = scrapeBlock(item, args, data, patron, headers, action, pagination, debug, typeContentDict,
-                                       typeActionDict, blacklist, search, pag, function, lang)
+                                       typeActionDict, blacklist, search, pag, function, lang, ptn)
            if 'itemlistHook' in args:
                itemlist = args['itemlistHook'](itemlist)
@@ -0,0 +1,15 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 from .parse import PTN
 # https://github.com/divijbindlish/parse-torrent-name
 # Package metadata for the bundled parse-torrent-name library.
 __author__ = 'Divij Bindlish'
 __email__ = 'dvjbndlsh93@gmail.com'
 __version__ = '1.1.1'
 __license__ = 'MIT'
 # Module-level parser instance, created once at import time.
 ptn = PTN()
 def parse(name):
    """Parse ``name`` with the module-level ``ptn`` instance.

    Returns whatever ``ptn.parse`` returns for ``name``.
    """
    parsed = ptn.parse(name)
    return parsed
@@ -0,0 +1,136 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 import re
 from .patterns import patterns, types
 class PTN(object):
    """Split a torrent/release name into named parts.

    ``parse`` runs every ``(key, regex)`` pair from ``patterns`` against the
    name, stores what matched in ``self.parts``, derives the title from the
    text surrounding the matches and files whatever is left under
    ``'group'``, ``'episodeName'`` or ``'excess'``.

    All working state lives on the instance and is reset at the start of
    each ``parse`` call.
    """

    def __init__(self):
        self.torrent = None     # {'name': ...} plus optional 'map'
        self.excess_raw = None  # input name with every matched raw part removed
        self.group_raw = None   # raw text matched by the 'group' pattern
        self.start = None       # start offset of the title within the name
        self.end = None         # end offset of the title (None = not bounded)
        self.title_raw = None
        self.parts = None       # result dict returned by parse()

    def _escape_regex(self, string):
        """Return ``string`` escaped so it matches literally inside a regex.

        The previous implementation used the JavaScript replacement token
        ``$&``, which Python's ``re.sub`` inserts as literal text; the
        resulting pattern could never match. ``re.escape`` is the Python
        equivalent.
        """
        return re.escape(string)

    def _part(self, name, match, raw, clean):
        """Record one parsed part and update the title/excess bookkeeping.

        :param name: key under which ``clean`` is stored in ``self.parts``
        :param match: regex match list; an empty list skips the title
            boundary update
        :param raw: raw matched text to remove from ``self.excess_raw``
            (``None`` to remove nothing)
        :param clean: value stored in ``self.parts[name]``
        """
        # The main core instructions
        self.parts[name] = clean

        if match:
            # A match at offset 0 pushes the title start past it; any other
            # match closer to the beginning pulls the title end forward.
            index = self.torrent['name'].find(match[0])
            if index == 0:
                self.start = len(match[0])
            elif self.end is None or index < self.end:
                self.end = index

        if name != 'excess':
            # Everything recognised is removed from the leftover text.
            if name == 'group':
                self.group_raw = raw
            if raw is not None:
                self.excess_raw = self.excess_raw.replace(raw, '')

    def _late(self, name, clean):
        """Store a part discovered while processing the leftover text.

        Only ``'group'`` and ``'episodeName'`` are handled; other names are
        ignored. Episode names have dots/underscores turned into spaces and
        are stripped.
        """
        if name == 'group':
            self._part(name, [], None, clean)
        elif name == 'episodeName':
            clean = re.sub(r'[\._]', ' ', clean)
            self._part(name, [], None, clean.strip())

    def parse(self, name):
        """Parse ``name`` and return a dict of the recognised parts.

        :param name: torrent/release name
        :return: ``self.parts``; always contains ``'title'``, plus one entry
            per pattern that matched and optionally ``'group'``,
            ``'episodeName'`` and ``'excess'``
        """
        self.parts = {}
        self.torrent = {'name': name}
        self.excess_raw = name
        self.group_raw = ''
        self.start = 0
        self.end = None
        self.title_raw = None

        pattern_by_key = dict(patterns)
        # Patterns are matched against the name with underscores as spaces;
        # the name does not change while looping, so compute this once.
        clean_name = name.replace('_', ' ')

        for key, pattern in patterns:
            if key not in ('season', 'episode', 'website'):
                pattern = r'\b%s\b' % pattern

            match = re.findall(pattern, clean_name, re.I)
            if not match:
                continue

            # With several groups findall yields tuples: keep the first hit,
            # whose item 0 is the raw text and item 1 the cleaned value.
            if isinstance(match[0], tuple):
                match = list(match[0])
            raw = match[0]
            clean_index = 1 if len(match) > 1 else 0

            value_type = types.get(key)
            if value_type == 'boolean':
                clean = True
            else:
                clean = match[clean_index]
                if value_type == 'integer':
                    clean = int(clean)

            if key == 'group':
                # A "group" that is really a codec or quality tag is dropped.
                if re.search(pattern_by_key['codec'], clean, re.I) \
                        or re.search(pattern_by_key['quality'], clean):
                    continue
                # Three or more space separated words: treat as episode name.
                if re.match(r'[^ ]+ [^ ]+ .+', clean):
                    key = 'episodeName'

            if key == 'episode':
                # Mark where the episode tag sits so the text right after it
                # can later be recognised as the episode name.
                self.torrent['map'] = re.sub(
                    self._escape_regex(raw), '{episode}', name
                )

            self._part(key, match, raw, clean)

        # Start process for title
        raw = name
        if self.end is not None:
            raw = raw[self.start:self.end].split('(')[0]

        clean = re.sub(r'^ -', '', raw)
        if ' ' not in clean and '.' in clean:
            clean = clean.replace('.', ' ')
        clean = clean.replace('_', ' ')
        clean = re.sub(r'([\[\(_]|- )$', '', clean).strip()
        self._part('title', [], raw, clean)

        # Start process for end
        clean = re.sub(r'(^[-\. ()]+)|([-\. ]+$)', '', self.excess_raw)
        clean = re.sub(r'[\(\)\/]', ' ', clean)
        pieces = re.split(r'\.\.+| +', clean)
        clean = [piece.strip('-') for piece in pieces
                 if piece and piece != '-']

        if clean:
            group_pattern = clean[-1] + self.group_raw
            if name.find(group_pattern) == len(name) - len(group_pattern):
                self._late('group', clean.pop() + self.group_raw)

            if 'map' in self.torrent and clean:
                episode_name_pattern = (
                    '{episode}' + re.sub(r'_+$', '', clean[0])
                )
                if self.torrent['map'].find(episode_name_pattern) != -1:
                    self._late('episodeName', clean.pop(0))

        if clean:
            # A single leftover is stored as a string, several as a list.
            if len(clean) == 1:
                clean = clean[0]
            self._part('excess', [], self.excess_raw, clean)
        return self.parts
@@ -0,0 +1,43 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 # Ordered (key, regex) pairs tried against a release name.
 # The order matters: the parser walks the list front to back and also
 # addresses entries by position, so new entries belong at the end.
 # When a regex has two groups, group 1 is the raw matched text and
 # group 2 the cleaned value.
 patterns = [
    ('season', r'(s?([0-9]{1,2}))[ex]'),
    ('episode', r'([ex]([0-9]{2})(?:[^0-9]|$))'),
    # Years 1900-2099 (the upper bound used to stop at 2019).
    ('year', r'([\[\(]?((?:19[0-9]|20[0-9])[0-9])[\]\)]?)'),
    ('resolution', r'([0-9]{3,4}p)'),
    ('quality', (r'((?:PPV\.)?[HP]DTV|(?:HD)?CAM|B[DR]Rip|(?:HD-?)?TS|'
                 r'(?:PPV )?WEB-?DL(?: DVDRip)?|HDRip|DVDRip|DVDRIP|'
                 r'CamRip|W[EB]BRip|BluRay|DvDScr|hdtv|telesync)')),
    ('codec', r'(xvid|[hx]\.?26[45])'),
    ('audio', (r'(MP3|DD5\.?1|Dual[\- ]Audio|LiNE|DTS|'
               r'AAC[.-]LC|AAC(?:\.?2\.0)?|'
               r'AC3(?:\.5\.1)?)')),
    ('group', r'(- ?([^-]+(?:-={[^-]+-?$)?))$'),
    ('region', r'R[0-9]'),
    # "(:?.CUT)" was a typo for the non-capturing "(?:.CUT)".
    ('extended', r'(EXTENDED(?:.CUT)?)'),
    ('hardcoded', r'HC'),
    ('proper', r'PROPER'),
    ('repack', r'REPACK'),
    ('container', r'(MKV|AVI|MP4)'),
    ('widescreen', r'WS'),
    ('website', r'^(\[ ?([^\]]+?) ?\])'),
    ('language', r'(rus\.eng|ita\.eng)'),
    ('sbs', r'(?:Half-)?SBS'),
    ('unrated', r'UNRATED'),
    ('size', r'(\d+(?:\.\d+)?(?:GB|MB))'),
    ('3d', r'3D')
 ]
 # Conversion applied to a matched value, by pattern key: 'integer' values
 # are converted with int(), 'boolean' keys are stored as True when the
 # pattern matches. Keys not listed here keep the matched string.
 types = dict.fromkeys(('season', 'episode', 'year'), 'integer')
 types.update(dict.fromkeys(
    ('extended', 'hardcoded', 'proper', 'repack',
     'widescreen', 'unrated', '3d'),
    'boolean'
 ))