fix CF
@@ -22,12 +22,6 @@ host = config.get_channel_url(findhost)
 headers = [['Referer', host]]
 
-
-checklinks = config.get_setting('checklinks', 'cineblog01')
-checklinks_number = config.get_setting('checklinks_number', 'cineblog01')
-
-
-
 
 @support.menu
 def mainlist(item):
     film = [
@@ -49,15 +49,14 @@ HTTPTOOLS_DEFAULT_RANDOM_HEADERS = False
 # domainCF.append(urlparse.urlparse(config.get_channel_url(name=ch)).hostname)
 # domainCF.extend(otherCF)
 
-global CF_LIST
-CF_LIST = list()
-CF_LIST_PATH = os.path.join(config.get_data_path(), "CF_Domains.txt")
-
-if os.path.exists(CF_LIST_PATH):
-    with open(CF_LIST_PATH, "rb") as CF_File:
-        CF_LIST = CF_File.read().splitlines()
+# CF_LIST = list()
+# CF_LIST_PATH = os.path.join(config.get_data_path(), "CF_Domains.txt")
+#
+# if os.path.exists(CF_LIST_PATH):
+#     with open(CF_LIST_PATH, "rb") as CF_File:
+#         CF_LIST = CF_File.read().splitlines()
 
-FORCE_CLOUDSCRAPER_LIST = ['akvideo.stream']
+FORCE_CLOUDSCRAPER_LIST = []
 
 def get_user_agent():
     # Returns the global user agent to be used when necessary for the url.
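Note: the block commented out above was a small on-disk cache of Cloudflare-protected domains, one per line in CF_Domains.txt. A minimal standalone sketch of that pattern, with the path and helper names chosen for illustration (the addon builds the path from config.get_data_path() and has no such helpers):

import os

CF_CACHE = os.path.join('.', 'CF_Domains.txt')  # illustrative; the addon uses config.get_data_path()

def load_cf_domains(path=CF_CACHE):
    # One domain per line; returns [] when the cache file does not exist yet.
    if not os.path.exists(path):
        return []
    with open(path) as cf_file:
        return cf_file.read().splitlines()

def remember_cf_domain(domain, path=CF_CACHE):
    # Append a newly detected domain, as the disabled code did on CF detection.
    with open(path, 'a') as cf_file:
        cf_file.write('%s\n' % domain)

Judging from the rest of this commit, the cache is disabled because it was too sticky: once a domain was written to the file, every later request to it was rerouted through the archive workaround, so a single false-positive detection poisoned the domain permanently.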
@@ -268,7 +267,7 @@ def downloadpage(url, **opt):
     """
     url = scrapertools.unescape(url)
     domain = urlparse.urlparse(url).netloc
-    global CF_LIST
+    # global CF_LIST
     CF = False
 
     if domain in FORCE_CLOUDSCRAPER_LIST:
@@ -279,7 +278,8 @@ def downloadpage(url, **opt):
         from lib import requests
         session = requests.session()
 
-    if domain in CF_LIST or opt.get('CF', False):
+    # if domain in CF_LIST or opt.get('CF', False):
+    if opt.get('CF', False):
         url = 'https://web.archive.org/save/' + url
         CF = True
 
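Note: the surviving branch keeps the Wayback Machine workaround but only on explicit request via opt['CF']. Prefixing a URL with https://web.archive.org/save/ asks archive.org's own servers to fetch and store the page, so the Cloudflare challenge runs against the archive rather than this client. A minimal sketch of the idea, assuming the plain requests library instead of the bundled lib.requests:

import requests

def fetch_via_wayback(url):
    # archive.org fetches the target itself; the caller receives the archived
    # copy and never sees the Cloudflare JavaScript challenge.
    return requests.get('https://web.archive.org/save/' + url, timeout=60)

The trade-off, visible later in this diff, is that the archived HTML comes back with rewritten /save/ links and an archive.org response URL, both of which have to be undone afterwards.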
@@ -398,20 +398,22 @@ def downloadpage(url, **opt):
         return type('HTTPResponse', (), response)
 
     response_code = req.status_code
+    response['data'] = req.content if req.content else ''
+    response['url'] = req.url
 
-    if req.headers.get('Server', '').startswith('cloudflare') and response_code in [429, 503, 403] and not opt.get('CF', False):
-        if domain not in CF_LIST:
-            opt["CF"] = True
-            with open(CF_LIST_PATH, "a") as CF_File:
-                CF_File.write("%s\n" % domain)
-            logger.debug("CF retry... for domain: %s" % domain)
-            return downloadpage(url, **opt)
+    if req.headers.get('Server', '').startswith('cloudflare') and response_code in [429, 503, 403]\
+            and not opt.get('CF', False) and 'Please turn JavaScript on and reload the page' in response['data']:
+        # if domain not in CF_LIST:
+        opt["CF"] = True
+        # with open(CF_LIST_PATH, "a") as CF_File:
+        #     CF_File.write("%s\n" % domain)
+        logger.debug("CF retry... for domain: %s" % domain)
+        return downloadpage(url, **opt)
 
-    response['data'] = req.content if req.content else ''
     if CF:
         import re
         response['data'] = re.sub('["|\']/save/[^"]*(https?://[^"]+)', '"\\1', response['data'])
-    response['url'] = req.url
+        response['url'] = response['url'].replace('https://web.archive.org/save/', '')
 
     if type(response['data']) != str:
         response['data'] = response['data'].decode('UTF-8')
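Note: two things change in this hunk. Detection now also requires the literal challenge text ('Please turn JavaScript on and reload the page', which Cloudflare's interstitial page contains), so an ordinary 403/503 from a Cloudflare-fronted server no longer triggers the workaround, and the flagged domain is no longer persisted to CF_Domains.txt. The re.sub line, unchanged by the commit, strips the /save/ prefixes that archive.org injects into links in the archived HTML; a small demonstration on a synthetic snippet (not real addon output):

import re

data = '<a href="/save/_embed/https://example.com/page">link</a>'
clean = re.sub('["|\']/save/[^"]*(https?://[^"]+)', '"\\1', data)
print(clean)  # <a href="https://example.com/page">link</a>

The recursive `return downloadpage(url, **opt)` then retries the same URL once with opt['CF'] set, which takes the web.archive.org/save/ branch shown in the previous hunk.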
@@ -110,7 +110,7 @@ def get_channel_url(findhostMethod=None, name=None):
         name = os.path.basename(frame[0].f_code.co_filename).replace('.py', '')
     if findhostMethod:
         url = jsontools.get_node_from_file(name, 'url')
-        if not url:
+        if not url or 'web.archive.org' in url:  # to purge all the web.archive.org URLs saved because of the httptools CF bug; remove this check in the future
             url = findhostMethod()
             jsontools.update_node(url, name, 'url')
         return url
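Note: get_channel_url caches each channel's resolved URL through jsontools; because of the bug this commit fixes, some caches ended up holding web.archive.org/save/ addresses, and the widened condition forces those entries to be re-resolved once. A minimal sketch of the repair pattern, with a plain dict standing in for the jsontools-backed store and hypothetical names throughout:

def resolve_channel_url(name, findhost, cache):
    url = cache.get(name)
    # A cached Wayback address is treated as stale, exactly like an empty one.
    if not url or 'web.archive.org' in url:
        url = findhost()
        cache[name] = url
    return url

cache = {'somechannel': 'https://web.archive.org/save/https://example.org'}
print(resolve_channel_url('somechannel', lambda: 'https://example.org', cache))  # https://example.org

As the translated comment says, the extra check is meant to be temporary: once every poisoned cache entry has been rewritten, `if not url:` suffices again.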