get_channel_url migliorato

2019-12-26 15:04:38 +01:00
parent aea46a7b42
commit 6a226785d2
40 changed files with 103 additions and 129 deletions
@@ -377,7 +377,16 @@ def scrape(func):

        log('PATRON= ', patron)
        if not data:
-            data = httptools.downloadpage(item.url, headers=headers, ignore_response_code=True, session=item.session).data.replace("'", '"')
+            page = httptools.downloadpage(item.url, headers=headers, ignore_response_code=True, session=item.session)
+            # if the page came back empty or from a different domain and the channel defines findhost(), refresh the stored host and retry the download
+            if (not page.data or scrapertoolsV2.get_domain_from_url(page.url) != scrapertoolsV2.get_domain_from_url(item.url)) and 'findhost' in func.__globals__:
+                host = func.__globals__['findhost']()
+                from core import jsontools
+                jsontools.update_node(host, func.__module__.split('.')[-1], 'url')
+                item.url = item.url.replace(scrapertoolsV2.get_domain_from_url(item.url), scrapertoolsV2.get_domain_from_url(host))
+                page = httptools.downloadpage(item.url, headers=headers, ignore_response_code=True,
+                                              session=item.session)
+            data = page.data.replace("'", '"')
            data = re.sub('\n|\t', ' ', data)
            data = re.sub(r'>\s+<', '> <', data)
            # replace all ' with " and eliminate newline, so we don't need to worry about