Commentato support.match
This commit is contained in:
+47
-24
@@ -157,8 +157,7 @@ def scrapeLang(scraped, lang, longtitle):
|
|||||||
return language, longtitle
|
return language, longtitle
|
||||||
|
|
||||||
def cleantitle(title):
|
def cleantitle(title):
|
||||||
if type(title) != str: title.decode('UTF-8')
|
cleantitle = scrapertools.htmlclean(scrapertools.decodeHtmlentities(title).replace('"', "'").replace('×', 'x').replace('–', '-')).strip()
|
||||||
cleantitle = title.replace('"', "'").replace('×', 'x').replace('–', '-').strip()
|
|
||||||
return cleantitle
|
return cleantitle
|
||||||
|
|
||||||
def scrapeBlock(item, args, block, patron, headers, action, pagination, debug, typeContentDict, typeActionDict, blacklist, search, pag, function, lang):
|
def scrapeBlock(item, args, block, patron, headers, action, pagination, debug, typeContentDict, typeActionDict, blacklist, search, pag, function, lang):
|
||||||
@@ -193,20 +192,16 @@ def scrapeBlock(item, args, block, patron, headers, action, pagination, debug, t
|
|||||||
for i, match in enumerate(matches):
|
for i, match in enumerate(matches):
|
||||||
if pagination and (pag - 1) * pagination > i and not search: continue # pagination
|
if pagination and (pag - 1) * pagination > i and not search: continue # pagination
|
||||||
if pagination and i >= pag * pagination and not search: break # pagination
|
if pagination and i >= pag * pagination and not search: break # pagination
|
||||||
# listGroups = match.keys()
|
listGroups = match.keys()
|
||||||
# match = match.values()
|
match = match.values()
|
||||||
|
|
||||||
# if len(listGroups) > len(match): # to fix a bug
|
if len(listGroups) > len(match): # to fix a bug
|
||||||
# support.log()
|
match = list(match)
|
||||||
# match = list(match)
|
match.extend([''] * (len(listGroups) - len(match)))
|
||||||
# match.extend([''] * (len(listGroups) - len(match)))
|
|
||||||
|
|
||||||
scraped = {}
|
scraped = {}
|
||||||
for kk in known_keys:
|
for kk in known_keys:
|
||||||
# log('KK=',kk)
|
val = match[listGroups.index(kk)] if kk in listGroups else ''
|
||||||
# log('LIST',list(listGroups))
|
|
||||||
# log(match[dict_values])
|
|
||||||
val = match[kk] if kk in match else ''
|
|
||||||
if val and (kk == "url" or kk == 'thumb') and 'http' not in val:
|
if val and (kk == "url" or kk == 'thumb') and 'http' not in val:
|
||||||
val = scrapertools.find_single_match(item.url, 'https?://[a-z0-9.-]+') + (val if val.startswith('/') else '/' + val)
|
val = scrapertools.find_single_match(item.url, 'https?://[a-z0-9.-]+') + (val if val.startswith('/') else '/' + val)
|
||||||
scraped[kk] = val
|
scraped[kk] = val
|
||||||
@@ -299,8 +294,8 @@ def scrapeBlock(item, args, block, patron, headers, action, pagination, debug, t
|
|||||||
other = scraped['other'] if scraped['other'] else ''
|
other = scraped['other'] if scraped['other'] else ''
|
||||||
)
|
)
|
||||||
|
|
||||||
for lg in list(set(match.keys()).difference(known_keys)):
|
for lg in list(set(listGroups).difference(known_keys)):
|
||||||
it.__setattr__(lg, match[lg])
|
it.__setattr__(lg, match[listGroups.index(lg)])
|
||||||
|
|
||||||
if 'itemHook' in args:
|
if 'itemHook' in args:
|
||||||
it = args['itemHook'](it)
|
it = args['itemHook'](it)
|
||||||
@@ -381,8 +376,8 @@ def scrape(func):
|
|||||||
jsontools.update_node(host, func.__module__.split('.')[-1], 'url')
|
jsontools.update_node(host, func.__module__.split('.')[-1], 'url')
|
||||||
parse[1] = scrapertools.get_domain_from_url(host)
|
parse[1] = scrapertools.get_domain_from_url(host)
|
||||||
item.url = urlparse.urlunparse(parse)
|
item.url = urlparse.urlunparse(parse)
|
||||||
page = httptools.downloadpage(item.url, headers=headers, ignore_response_code=True, session=item.session)
|
page = httptools.downloadpage(item.url, headers=headers, ignore_response_code=True,
|
||||||
|
session=item.session)
|
||||||
data = page.data.replace("'", '"')
|
data = page.data.replace("'", '"')
|
||||||
data = re.sub('\n|\t', ' ', data)
|
data = re.sub('\n|\t', ' ', data)
|
||||||
data = re.sub(r'>\s+<', '> <', data)
|
data = re.sub(r'>\s+<', '> <', data)
|
||||||
@@ -749,7 +744,7 @@ def typo(string, typography=''):
|
|||||||
if '{}' in string:
|
if '{}' in string:
|
||||||
string = '{' + re.sub(r'\s\{\}','',string) + '}'
|
string = '{' + re.sub(r'\s\{\}','',string) + '}'
|
||||||
if 'submenu' in string:
|
if 'submenu' in string:
|
||||||
string = "•• " + re.sub(r'\ssubmenu','',string)
|
string = u"\u2022\u2022 ".encode('utf-8') + re.sub(r'\ssubmenu','',string)
|
||||||
if 'color' in string:
|
if 'color' in string:
|
||||||
color = scrapertools.find_single_match(string, 'color ([a-z]+)')
|
color = scrapertools.find_single_match(string, 'color ([a-z]+)')
|
||||||
if color == 'kod' or '': color = kod_color
|
if color == 'kod' or '': color = kod_color
|
||||||
@@ -763,7 +758,7 @@ def typo(string, typography=''):
|
|||||||
if '--' in string:
|
if '--' in string:
|
||||||
string = ' - ' + re.sub(r'\s--','',string)
|
string = ' - ' + re.sub(r'\s--','',string)
|
||||||
if 'bullet' in string:
|
if 'bullet' in string:
|
||||||
string = '[B]•[/B] ' + re.sub(r'\sbullet','',string)
|
string = '[B]' + u"\u2022".encode('utf-8') + '[/B] ' + re.sub(r'\sbullet','',string)
|
||||||
|
|
||||||
return string
|
return string
|
||||||
|
|
||||||
@@ -771,10 +766,33 @@ def typo(string, typography=''):
|
|||||||
def match(item_url_string, **args):
|
def match(item_url_string, **args):
|
||||||
'''
|
'''
|
||||||
match is a function that combines httptools and scraper tools:
|
match is a function that combines httptools and scraper tools:
|
||||||
|
|
||||||
|
supports all httptools and the following args:
|
||||||
|
@param item_url_string: if it's an item, download the page at item.url; if it's a URL, download that page; if it's any other string, pass it to scrapertools
|
||||||
|
@type item_url_string: item or str
|
||||||
|
@param string: force item_url_string to be a string
|
||||||
|
@type string: bool
|
||||||
|
@param patronBlock: find first element in patron
|
||||||
|
@type patronBlock: str
|
||||||
|
@param patronBlocks: find multiple matches
|
||||||
|
@type patronBlocks: str or list
|
||||||
|
@param debugBlock: regex101.com for debug
|
||||||
|
@type debugBlock: bool
|
||||||
|
@param patron: find multiple matches on block, blocks or data
|
||||||
|
@type patron: str or list
|
||||||
|
@param debug: regex101.com for debug
|
||||||
|
@type debug: bool
|
||||||
|
|
||||||
|
Returns an item with the following keys:
|
||||||
|
data: data of the webpage
|
||||||
|
block: first block
|
||||||
|
blocks: all the blocks
|
||||||
|
match: first match
|
||||||
|
matches: all the matches
|
||||||
'''
|
'''
|
||||||
log(item_url_string)
|
log(item_url_string)
|
||||||
|
|
||||||
matches = []
|
matches = blocks = []
|
||||||
url = None
|
url = None
|
||||||
# arguments allowed for scrape
|
# arguments allowed for scrape
|
||||||
patron = args.get('patron', None)
|
patron = args.get('patron', None)
|
||||||
@@ -783,12 +801,15 @@ def match(item_url_string, **args):
|
|||||||
debug = args.get('debug', False)
|
debug = args.get('debug', False)
|
||||||
debugBlock = args.get('debugBlock', False)
|
debugBlock = args.get('debugBlock', False)
|
||||||
string = args.get('string', False)
|
string = args.get('string', False)
|
||||||
|
|
||||||
# remove scrape arguments
|
# remove scrape arguments
|
||||||
args = dict([(key, val) for key, val in args.items() if key not in ['patron', 'patronBlock', 'patronBlocks', 'debug', 'debugBlock', 'string']])
|
args = dict([(key, val) for key, val in args.items() if key not in ['patron', 'patronBlock', 'patronBlocks', 'debug', 'debugBlock', 'string']])
|
||||||
# dbg()
|
|
||||||
# check type of item_url_string
|
# check type of item_url_string
|
||||||
if type(item_url_string) == str:
|
if string:
|
||||||
if item_url_string.startswith('http') and not string: url = item_url_string
|
data = item_url_string
|
||||||
|
elif type(item_url_string) == str:
|
||||||
|
if item_url_string.startswith('http'): url = item_url_string
|
||||||
else : data = item_url_string
|
else : data = item_url_string
|
||||||
else:
|
else:
|
||||||
# if item_url_string is an item use item.url as url
|
# if item_url_string is an item use item.url as url
|
||||||
@@ -808,7 +829,9 @@ def match(item_url_string, **args):
|
|||||||
if patronBlock:
|
if patronBlock:
|
||||||
blocks = [scrapertools.find_single_match(data, patronBlock)]
|
blocks = [scrapertools.find_single_match(data, patronBlock)]
|
||||||
elif patronBlocks:
|
elif patronBlocks:
|
||||||
blocks = scrapertools.find_multiple_matches(data, patronBlock)
|
if type(patronBlock) == str: patron = [patronBlock]
|
||||||
|
for p in patronBlock:
|
||||||
|
blocks += scrapertools.find_multiple_matches(data, p)
|
||||||
else:
|
else:
|
||||||
blocks = [data]
|
blocks = [data]
|
||||||
|
|
||||||
@@ -1041,7 +1064,7 @@ def controls(itemlist, item, AutoPlay=True, CheckLinks=True, down_load=True):
|
|||||||
channel_node = autoplay_node.get(item.channel, {})
|
channel_node = autoplay_node.get(item.channel, {})
|
||||||
settings_node = channel_node.get('settings', {})
|
settings_node = channel_node.get('settings', {})
|
||||||
AP = get_setting('autoplay') or settings_node['active']
|
AP = get_setting('autoplay') or settings_node['active']
|
||||||
HS = config.get_setting('hide_servers') or (settings_node['hide_servers'] if 'hide_server' in settings_node else False)
|
HS = config.get_setting('hide_servers') or (settings_node['hide_servers'] if settings_node.has_key('hide_server') else False)
|
||||||
|
|
||||||
if CL and not AP:
|
if CL and not AP:
|
||||||
if get_setting('checklinks', item.channel):
|
if get_setting('checklinks', item.channel):
|
||||||
|
|||||||
Reference in New Issue
Block a user