fix CF
@@ -22,12 +22,6 @@ host = config.get_channel_url(findhost)
 headers = [['Referer', host]]
 
-
-checklinks = config.get_setting('checklinks', 'cineblog01')
-checklinks_number = config.get_setting('checklinks_number', 'cineblog01')
-
-
-
 
 @support.menu
 def mainlist(item):
     film = [
@@ -49,15 +49,14 @@ HTTPTOOLS_DEFAULT_RANDOM_HEADERS = False
 # domainCF.append(urlparse.urlparse(config.get_channel_url(name=ch)).hostname)
 # domainCF.extend(otherCF)
 
-global CF_LIST
-CF_LIST = list()
-CF_LIST_PATH = os.path.join(config.get_data_path(), "CF_Domains.txt")
-
-if os.path.exists(CF_LIST_PATH):
-    with open(CF_LIST_PATH, "rb") as CF_File:
-        CF_LIST = CF_File.read().splitlines()
+# CF_LIST = list()
+# CF_LIST_PATH = os.path.join(config.get_data_path(), "CF_Domains.txt")
+#
+# if os.path.exists(CF_LIST_PATH):
+#     with open(CF_LIST_PATH, "rb") as CF_File:
+#         CF_LIST = CF_File.read().splitlines()
 
-FORCE_CLOUDSCRAPER_LIST = ['akvideo.stream']
+FORCE_CLOUDSCRAPER_LIST = []
 
 def get_user_agent():
     # Returns the global user agent to be used when necessary for the url.
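Note: the block commented out above was a small on-disk cache of Cloudflare-protected domains, one per line in CF_Domains.txt. A minimal standalone sketch of that pattern, with the path and helper names chosen for illustration (the addon builds the path from config.get_data_path() and has no such helpers):

import os

CF_CACHE = os.path.join('.', 'CF_Domains.txt')  # illustrative; the addon uses config.get_data_path()

def load_cf_domains(path=CF_CACHE):
    # One domain per line; returns [] when the cache file does not exist yet.
    if not os.path.exists(path):
        return []
    with open(path) as cf_file:
        return cf_file.read().splitlines()

def remember_cf_domain(domain, path=CF_CACHE):
    # Append a newly detected domain, as the disabled code did on CF detection.
    with open(path, 'a') as cf_file:
        cf_file.write('%s\n' % domain)

Judging from the rest of this commit, the cache is disabled because it was too sticky: once a domain was written to the file, every later request to it was rerouted through the archive workaround, so a single false-positive detection poisoned the domain permanently.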
@@ -268,7 +267,7 @@ def downloadpage(url, **opt):
     """
     url = scrapertools.unescape(url)
     domain = urlparse.urlparse(url).netloc
-    global CF_LIST
+    # global CF_LIST
     CF = False
 
     if domain in FORCE_CLOUDSCRAPER_LIST:
@@ -279,7 +278,8 @@ def downloadpage(url, **opt):
         from lib import requests
         session = requests.session()
 
-    if domain in CF_LIST or opt.get('CF', False):
+    # if domain in CF_LIST or opt.get('CF', False):
+    if opt.get('CF', False):
         url = 'https://web.archive.org/save/' + url
         CF = True
 
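Note: the surviving branch keeps the Wayback Machine workaround but only on explicit request via opt['CF']. Prefixing a URL with https://web.archive.org/save/ asks archive.org's own servers to fetch and store the page, so the Cloudflare challenge runs against the archive rather than this client. A minimal sketch of the idea, assuming the plain requests library instead of the bundled lib.requests:

import requests

def fetch_via_wayback(url):
    # archive.org fetches the target itself; the caller receives the archived
    # copy and never sees the Cloudflare JavaScript challenge.
    return requests.get('https://web.archive.org/save/' + url, timeout=60)

The trade-off, visible later in this diff, is that the archived HTML comes back with rewritten /save/ links and an archive.org response URL, both of which have to be undone afterwards.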
@@ -398,20 +398,22 @@ def downloadpage(url, **opt):
         return type('HTTPResponse', (), response)
 
     response_code = req.status_code
+    response['data'] = req.content if req.content else ''
+    response['url'] = req.url
 
-    if req.headers.get('Server', '').startswith('cloudflare') and response_code in [429, 503, 403] and not opt.get('CF', False):
-        if domain not in CF_LIST:
-            opt["CF"] = True
-            with open(CF_LIST_PATH, "a") as CF_File:
-                CF_File.write("%s\n" % domain)
-            logger.debug("CF retry... for domain: %s" % domain)
-            return downloadpage(url, **opt)
+    if req.headers.get('Server', '').startswith('cloudflare') and response_code in [429, 503, 403]\
+            and not opt.get('CF', False) and 'Please turn JavaScript on and reload the page' in response['data']:
+        # if domain not in CF_LIST:
+        opt["CF"] = True
+        # with open(CF_LIST_PATH, "a") as CF_File:
+        #     CF_File.write("%s\n" % domain)
+        logger.debug("CF retry... for domain: %s" % domain)
+        return downloadpage(url, **opt)
 
-    response['data'] = req.content if req.content else ''
     if CF:
         import re
         response['data'] = re.sub('["|\']/save/[^"]*(https?://[^"]+)', '"\\1', response['data'])
-    response['url'] = req.url
+        response['url'] = response['url'].replace('https://web.archive.org/save/', '')
 
     if type(response['data']) != str:
         response['data'] = response['data'].decode('UTF-8')
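Note: two things change in this hunk. Detection now also requires the literal challenge text ('Please turn JavaScript on and reload the page', which Cloudflare's interstitial page contains), so an ordinary 403/503 from a Cloudflare-fronted server no longer triggers the workaround, and the flagged domain is no longer persisted to CF_Domains.txt. The re.sub line, unchanged by the commit, strips the /save/ prefixes that archive.org injects into links in the archived HTML; a small demonstration on a synthetic snippet (not real addon output):

import re

data = '<a href="/save/_embed/https://example.com/page">link</a>'
clean = re.sub('["|\']/save/[^"]*(https?://[^"]+)', '"\\1', data)
print(clean)  # <a href="https://example.com/page">link</a>

The recursive `return downloadpage(url, **opt)` then retries the same URL once with opt['CF'] set, which takes the web.archive.org/save/ branch shown in the previous hunk.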
@@ -110,7 +110,7 @@ def get_channel_url(findhostMethod=None, name=None):
         name = os.path.basename(frame[0].f_code.co_filename).replace('.py', '')
     if findhostMethod:
         url = jsontools.get_node_from_file(name, 'url')
-        if not url:
+        if not url or 'web.archive.org' in url:  # to purge all the web.archive.org URLs saved because of the httptools CF bug; remove this check in the future
             url = findhostMethod()
             jsontools.update_node(url, name, 'url')
         return url
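Note: get_channel_url caches each channel's resolved URL through jsontools; because of the bug this commit fixes, some caches ended up holding web.archive.org/save/ addresses, and the widened condition forces those entries to be re-resolved once. A minimal sketch of the repair pattern, with a plain dict standing in for the jsontools-backed store and hypothetical names throughout:

def resolve_channel_url(name, findhost, cache):
    url = cache.get(name)
    # A cached Wayback address is treated as stale, exactly like an empty one.
    if not url or 'web.archive.org' in url:
        url = findhost()
        cache[name] = url
    return url

cache = {'somechannel': 'https://web.archive.org/save/https://example.org'}
print(resolve_channel_url('somechannel', lambda: 'https://example.org', cache))  # https://example.org

As the translated comment says, the extra check is meant to be temporary: once every poisoned cache entry has been rewritten, `if not url:` suffices again.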