From 1fd9338ea8da252e2fc48761430c4893bc196162 Mon Sep 17 00:00:00 2001
From: Alhaziel
Date: Mon, 17 Feb 2020 11:14:52 +0100
Subject: [PATCH] Document support.match

---
 core/support.py | 71 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 47 insertions(+), 24 deletions(-)

diff --git a/core/support.py b/core/support.py
index 012b51a2..5c44314d 100755
--- a/core/support.py
+++ b/core/support.py
@@ -157,8 +157,7 @@ def scrapeLang(scraped, lang, longtitle):
     return language, longtitle
 
 def cleantitle(title):
-    if type(title) != str: title.decode('UTF-8')
-    cleantitle = title.replace('"', "'").replace('×', 'x').replace('–', '-').strip()
+    cleantitle = scrapertools.htmlclean(scrapertools.decodeHtmlentities(title).replace('"', "'").replace('×', 'x').replace('–', '-')).strip()
     return cleantitle
 
 def scrapeBlock(item, args, block, patron, headers, action, pagination, debug, typeContentDict, typeActionDict, blacklist, search, pag, function, lang):
@@ -193,20 +192,16 @@ def scrapeBlock(item, args, block, patron, headers, action, pagination, debug, t
     for i, match in enumerate(matches):
         if pagination and (pag - 1) * pagination > i and not search: continue # pagination
         if pagination and i >= pag * pagination and not search: break # pagination
-        # listGroups = match.keys()
-        # match = match.values()
+        listGroups = match.keys()
+        match = match.values()
 
-        # if len(listGroups) > len(match): # to fix a bug
-        #     support.log()
-        #     match = list(match)
-        #     match.extend([''] * (len(listGroups) - len(match)))
+        if len(listGroups) > len(match): # to fix a bug
+            match = list(match)
+            match.extend([''] * (len(listGroups) - len(match)))
 
         scraped = {}
         for kk in known_keys:
-            # log('KK=',kk)
-            # log('LIST',list(listGroups))
-            # log(match[dict_values])
-            val = match[kk] if kk in match else ''
+            val = match[listGroups.index(kk)] if kk in listGroups else ''
             if val and (kk == "url" or kk == 'thumb') and 'http' not in val:
                 val = scrapertools.find_single_match(item.url, 'https?://[a-z0-9.-]+') + (val if val.startswith('/') else '/' + val)
             scraped[kk] = val
@@ -299,8 +294,8 @@ def scrapeBlock(item, args, block, patron, headers, action, pagination, debug, t
                         other = scraped['other'] if scraped['other'] else ''
                     )
 
-        for lg in list(set(match.keys()).difference(known_keys)):
-            it.__setattr__(lg, match[lg])
+        for lg in list(set(listGroups).difference(known_keys)):
+            it.__setattr__(lg, match[listGroups.index(lg)])
 
         if 'itemHook' in args:
             it = args['itemHook'](it)
@@ -381,8 +376,8 @@ def scrape(func):
             jsontools.update_node(host, func.__module__.split('.')[-1], 'url')
             parse[1] = scrapertools.get_domain_from_url(host)
             item.url = urlparse.urlunparse(parse)
-        page = httptools.downloadpage(item.url, headers=headers, ignore_response_code=True, session=item.session)
-
+        page = httptools.downloadpage(item.url, headers=headers, ignore_response_code=True,
+                                      session=item.session)
         data = page.data.replace("'", '"')
         data = re.sub('\n|\t', ' ', data)
         data = re.sub(r'>\s+<', '> <', data)
@@ -749,7 +744,7 @@ def typo(string, typography=''):
     if '{}' in string:
         string = '{' + re.sub(r'\s\{\}','',string) + '}'
     if 'submenu' in string:
-        string = "•• " + re.sub(r'\ssubmenu','',string)
+        string = u"\u2022\u2022 ".encode('utf-8') + re.sub(r'\ssubmenu','',string)
     if 'color' in string:
         color = scrapertools.find_single_match(string, 'color ([a-z]+)')
         if color == 'kod' or '': color = kod_color
@@ -763,7 +758,7 @@ def typo(string, typography=''):
     if '--' in string:
         string = ' - ' + re.sub(r'\s--','',string)
     if 'bullet' in string:
-        string = '[B]•[/B] ' + re.sub(r'\sbullet','',string)
+        string = '[B]' + u"\u2022".encode('utf-8') + '[/B] ' + re.sub(r'\sbullet','',string)
     return string
@@ -771,10 +766,33 @@ def typo(string, typography=''):
 
 def match(item_url_string, **args):
     '''
     match is a function that combines httptools and scraper tools:
+
+    supports all httptools arguments plus the following args:
+    @param item_url_string: if it's an Item, download the page at item.url; if it's a URL, download that page; if it's a plain string, pass it to scrapertools
+    @type item_url_string: Item or str
+    @param string: force item_url_string to be treated as a string
+    @type string: bool
+    @param patronBlock: find the first matching block in the data
+    @type patronBlock: str
+    @param patronBlocks: find all matching blocks in the data
+    @type patronBlocks: str or list
+    @param debugBlock: debug the block regex on regex101.com
+    @type debugBlock: bool
+    @param patron: find multiple matches on block, blocks or data
+    @type patron: str or list
+    @param debug: debug the regex on regex101.com
+    @type debug: bool
+
+    Returns an item with the following keys:
+        data: the data of the webpage
+        block: the first block
+        blocks: all the blocks
+        match: the first match
+        matches: all the matches
     '''
     log(item_url_string)
-    matches = []
+    matches, blocks = [], []
     url = None
     # arguments allowed for scrape
     patron = args.get('patron', None)
     patronBlock = args.get('patronBlock', None)
     patronBlocks = args.get('patronBlocks', None)
     debug = args.get('debug', False)
     debugBlock = args.get('debugBlock', False)
     string = args.get('string', False)
+    # remove scrape arguments
     args = dict([(key, val) for key, val in args.items() if key not in ['patron', 'patronBlock', 'patronBlocks', 'debug', 'debugBlock', 'string']])
-    # dbg()
+    # check the type of item_url_string
-    if type(item_url_string) == str:
-        if item_url_string.startswith('http') and not string: url = item_url_string
+    if string:
+        data = item_url_string
+    elif type(item_url_string) == str:
+        if item_url_string.startswith('http'): url = item_url_string
         else : data = item_url_string
     else: # if item_url_string is an item use item.url as url
@@ -808,7 +829,9 @@
     if patronBlock:
         blocks = [scrapertools.find_single_match(data, patronBlock)]
     elif patronBlocks:
-        blocks = scrapertools.find_multiple_matches(data, patronBlock)
+        if type(patronBlocks) == str: patronBlocks = [patronBlocks]
+        for p in patronBlocks:
+            blocks += scrapertools.find_multiple_matches(data, p)
     else:
         blocks = [data]
 
@@ -1041,7 +1064,7 @@ def controls(itemlist, item, AutoPlay=True, CheckLinks=True, down_load=True):
     channel_node = autoplay_node.get(item.channel, {})
     settings_node = channel_node.get('settings', {})
     AP = get_setting('autoplay') or settings_node['active']
-    HS = config.get_setting('hide_servers') or (settings_node['hide_servers'] if 'hide_server' in settings_node else False)
+    HS = config.get_setting('hide_servers') or settings_node.get('hide_servers', False)
 
     if CL and not AP:
         if get_setting('checklinks', item.channel):
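
A minimal usage sketch of the support.match API documented above (not part of
the patch: the channel function name, the two regexes and the action value are
made-up placeholders; it assumes the KOD core package is importable and that
Item exposes clone(), as used elsewhere in this codebase):

    from core import support

    def peliculas(item):
        itemlist = []
        # item.url is downloaded, patronBlock isolates the first matching
        # block, then patron is applied inside that block
        result = support.match(item,
                               patronBlock=r'<ul class="movies">(.*?)</ul>',
                               patron=r'<a href="([^"]+)"[^>]*>([^<]+)</a>')
        for url, title in result.matches:  # all the matches, as group tuples
            itemlist.append(item.clone(action='findvideos', url=url, title=title))
        return itemlist

    # a raw string can also be scraped directly by forcing string=True;
    # .match holds the first match ('kod' here)
    first = support.match('<b>kod</b>', patron=r'<b>([a-z]+)</b>', string=True).match

Per the docstring, result.data would hold the full page and result.block the
first block, so a channel can reuse the downloaded data without a second
request.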