get_channel_url migliorato

This commit is contained in:
marco
2019-12-26 15:04:38 +01:00
parent aea46a7b42
commit 6a226785d2
40 changed files with 103 additions and 129 deletions
+10 -1
View File
@@ -377,7 +377,16 @@ def scrape(func):
log('PATRON= ', patron)
if not data:
data = httptools.downloadpage(item.url, headers=headers, ignore_response_code=True, session=item.session).data.replace("'", '"')
page = httptools.downloadpage(item.url, headers=headers, ignore_response_code=True, session=item.session)
# if the page is empty or its domain differs from the requested one, and the channel defines findhost, refresh the host
if (not page.data or scrapertoolsV2.get_domain_from_url(page.url) != scrapertoolsV2.get_domain_from_url(item.url)) and 'findhost' in func.__globals__:
host = func.__globals__['findhost']()
from core import jsontools
jsontools.update_node(host, func.__module__.split('.')[-1], 'url')
item.url = item.url.replace(scrapertoolsV2.get_domain_from_url(item.url), scrapertoolsV2.get_domain_from_url(host))
page = httptools.downloadpage(item.url, headers=headers, ignore_response_code=True,
session=item.session)
data = page.data.replace("'", '"')
data = re.sub('\n|\t', ' ', data)
data = re.sub(r'>\s+<', '> <', data)
# replace all ' with " and eliminate newline, so we don't need to worry about