Aggiornato Httptools e cloudscrape

Fix per Py3 e commenti in inglese
This commit is contained in:
Alhaziel
2019-11-20 12:10:41 +01:00
committed by marco
parent 7b0a3152de
commit b0a69f9d86
8 changed files with 12622 additions and 1104 deletions
+529 -401
View File
File diff suppressed because it is too large Load Diff
+79 -69
View File
@@ -1,10 +1,8 @@
# Cloudscraper by VeNoMouS
# https://github.com/VeNoMouS/cloudscraper
import logging import logging
import re import re
import sys import sys
import ssl import ssl
import requests
from copy import deepcopy from copy import deepcopy
from time import sleep from time import sleep
@@ -15,6 +13,7 @@ from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.ssl_ import create_urllib3_context from requests.packages.urllib3.util.ssl_ import create_urllib3_context
from .interpreters import JavaScriptInterpreter from .interpreters import JavaScriptInterpreter
from .reCaptcha import reCaptcha
from .user_agent import User_Agent from .user_agent import User_Agent
try: try:
@@ -29,20 +28,29 @@ except ImportError:
try: try:
from urlparse import urlparse from urlparse import urlparse
from urlparse import urlunparse
except ImportError: except ImportError:
from urllib.parse import urlparse from urllib.parse import urlparse
from urllib.parse import urlunparse
########################################################################################################################################################## ##########################################################################################################################################################
__version__ = '1.1.12' __version__ = '1.1.24'
BUG_REPORT = 'Cloudflare may have changed their technique, or there may be a bug in the script.' BUG_REPORT = 'Cloudflare may have changed their technique, or there may be a bug in the script.'
########################################################################################################################################################## ##########################################################################################################################################################
# class CipherSuiteAdapter(HTTPAdapter):
#
# def __init__(self, cipherSuite=None, **kwargs):
# self.cipherSuite = cipherSuite
#
# self.ssl_context = create_urllib3_context(
# ssl_version=ssl.PROTOCOL_TLS,
# ciphers=self.cipherSuite
# )
#
# super(CipherSuiteAdapter, self).__init__(**kwargs)
class CipherSuiteAdapter(HTTPAdapter): class CipherSuiteAdapter(HTTPAdapter):
def __init__(self, cipherSuite=None, **kwargs): def __init__(self, cipherSuite=None, **kwargs):
@@ -75,10 +83,12 @@ class CipherSuiteAdapter(HTTPAdapter):
class CloudScraper(Session): class CloudScraper(Session):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
self.allow_brotli = kwargs.pop('allow_brotli', True if 'brotli' in sys.modules.keys() else False)
self.debug = kwargs.pop('debug', False) self.debug = kwargs.pop('debug', False)
self.delay = kwargs.pop('delay', None) self.delay = kwargs.pop('delay', None)
self.interpreter = kwargs.pop('interpreter', 'js2py') self.interpreter = kwargs.pop('interpreter', 'js2py')
self.allow_brotli = kwargs.pop('allow_brotli', True if 'brotli' in sys.modules.keys() else False) self.recaptcha = kwargs.pop('recaptcha', {})
self.cipherSuite = None self.cipherSuite = None
super(CloudScraper, self).__init__(*args, **kwargs) super(CloudScraper, self).__init__(*args, **kwargs)
@@ -108,21 +118,30 @@ class CloudScraper(Session):
if hasattr(ssl, 'PROTOCOL_TLS'): if hasattr(ssl, 'PROTOCOL_TLS'):
ciphers = [ ciphers = [
'ECDHE-ECDSA-AES128-GCM-SHA256', 'ECDHE-RSA-AES128-GCM-SHA256', 'ECDHE-ECDSA-AES256-GCM-SHA384', 'TLS13-AES-128-GCM-SHA256',
'ECDHE-RSA-AES256-GCM-SHA384', 'ECDHE-ECDSA-CHACHA20-POLY1305-SHA256', 'ECDHE-RSA-CHACHA20-POLY1305-SHA256', 'TLS13-AES-256-GCM-SHA384',
'ECDHE-RSA-AES128-CBC-SHA', 'ECDHE-RSA-AES256-CBC-SHA', 'RSA-AES128-GCM-SHA256', 'RSA-AES256-GCM-SHA384', 'TLS13-CHACHA20-POLY1305-SHA256',
'ECDHE-RSA-AES128-GCM-SHA256', 'RSA-AES256-SHA', '3DES-EDE-CBC' 'ECDHE-ECDSA-CHACHA20-POLY1305',
'ECDHE-ECDSA-AES128-GCM-SHA256',
'ECDHE-ECDSA-AES128-SHA',
'ECDHE-ECDSA-AES128-SHA256',
'ECDHE-ECDSA-AES256-GCM-SHA384',
'ECDHE-ECDSA-AES256-SHA',
'ECDHE-ECDSA-AES256-SHA384',
# Slip in some additional intermediate compatibility ciphers, This should help out users for non Cloudflare based sites.
'ECDHE-RSA-AES128-SHA256',
'ECDHE-RSA-AES256-SHA384',
'ECDHE-RSA-AES256-GCM-SHA384',
'DHE-RSA-AES128-GCM-SHA256',
'DHE-RSA-AES256-GCM-SHA384'
] ]
if hasattr(ssl, 'PROTOCOL_TLSv1_3'): ctx = ssl.SSLContext(ssl.PROTOCOL_TLS)
ciphers.insert(0, ['GREASE_3A', 'GREASE_6A', 'AES128-GCM-SHA256', 'AES256-GCM-SHA256', 'AES256-GCM-SHA384', 'CHACHA20-POLY1305-SHA256'])
ctx = ssl.SSLContext(getattr(ssl, 'PROTOCOL_TLSv1_3', ssl.PROTOCOL_TLSv1_2))
for cipher in ciphers: for cipher in ciphers:
try: try:
ctx.set_ciphers(cipher) ctx.set_ciphers(cipher)
self.cipherSuite = '{}:{}'.format(self.cipherSuite, cipher).rstrip(':') self.cipherSuite = '{}:{}'.format(self.cipherSuite, cipher).rstrip(':').lstrip(':')
except ssl.SSLError: except ssl.SSLError:
pass pass
@@ -134,7 +153,7 @@ class CloudScraper(Session):
ourSuper = super(CloudScraper, self) ourSuper = super(CloudScraper, self)
resp = ourSuper.request(method, url, *args, **kwargs) resp = ourSuper.request(method, url, *args, **kwargs)
if resp.headers.get('Content-Encoding') == 'br': if requests.packages.urllib3.__version__ < '1.25.1' and resp.headers.get('Content-Encoding') == 'br':
if self.allow_brotli and resp._content: if self.allow_brotli and resp._content:
resp._content = brotli.decompress(resp.content) resp._content = brotli.decompress(resp.content)
else: else:
@@ -163,12 +182,13 @@ class CloudScraper(Session):
@staticmethod @staticmethod
def isChallengeRequest(resp): def isChallengeRequest(resp):
if resp.headers.get('Server', '').startswith('cloudflare'): if resp.headers.get('Server', '').startswith('cloudflare'):
if b'why_captcha' in resp.content or b'/cdn-cgi/l/chk_captcha' in resp.content:
raise ValueError('Captcha')
return ( return (
resp.status_code in [429, 503] resp.status_code in [403, 429, 503]
and all(s in resp.content for s in [b'jschl_vc', b'jschl_answer']) and (
all(s in resp.content for s in [b'jschl_vc', b'jschl_answer'])
or
all(s in resp.content for s in [b'why_captcha', b'/cdn-cgi/l/chk_captcha'])
)
) )
return False return False
@@ -178,67 +198,56 @@ class CloudScraper(Session):
def sendChallengeResponse(self, resp, **original_kwargs): def sendChallengeResponse(self, resp, **original_kwargs):
body = resp.text body = resp.text
# Cloudflare requires a delay before solving the challenge
if not self.delay:
try:
delay = float(re.search(r'submit\(\);\r?\n\s*},\s*([0-9]+)', body).group(1)) / float(1000)
if isinstance(delay, (int, float)):
self.delay = delay
except: # noqa
pass
sleep(self.delay)
parsed_url = urlparse(resp.url) parsed_url = urlparse(resp.url)
domain = parsed_url.netloc domain = parsed_url.netloc
submit_url = '{}://{}/cdn-cgi/l/chk_jschl'.format(parsed_url.scheme, domain)
cloudflare_kwargs = deepcopy(original_kwargs) params = OrderedDict()
try: s = re.search(r'name="s"\svalue="(?P<s_value>[^"]+)', body)
params = OrderedDict() if s:
params['s'] = s.group('s_value')
s = re.search(r'name="s"\svalue="(?P<s_value>[^"]+)', body) if b'/cdn-cgi/l/chk_captcha' in resp.content:
if s: if not self.recaptcha or not isinstance(self.recaptcha, dict) or not self.recaptcha.get('provider'):
params['s'] = s.group('s_value') sys.tracebacklimit = 0
raise RuntimeError("Cloudflare reCaptcha detected, unfortunately you haven't loaded an anti reCaptcha provider correctly via the 'recaptcha' parameter.")
params.update( submit_url = '{}://{}/cdn-cgi/l/chk_captcha'.format(parsed_url.scheme, domain)
[ self.recaptcha['proxies'] = self.proxies
('jschl_vc', re.search(r'name="jschl_vc" value="(\w+)"', body).group(1)), params['g-recaptcha-response'] = reCaptcha.dynamicImport(self.recaptcha.get('provider').lower()).solveCaptcha(resp, self.recaptcha)
('pass', re.search(r'name="pass" value="(.+?)"', body).group(1)) else:
] # Cloudflare requires a delay before solving the challenge
) if not self.delay:
try:
delay = float(re.search(r'submit\(\);\r?\n\s*},\s*([0-9]+)', body).group(1)) / float(1000)
if isinstance(delay, (int, float)):
self.delay = delay
except: # noqa
pass
params = cloudflare_kwargs.setdefault('params', params) sleep(self.delay)
submit_url = '{}://{}/cdn-cgi/l/chk_jschl'.format(parsed_url.scheme, domain)
except Exception as e: try:
raise ValueError('Unable to parse Cloudflare anti-bots page: {} {}'.format(e.message, BUG_REPORT)) params.update(
[
# Solve the Javascript challenge ('jschl_vc', re.search(r'name="jschl_vc" value="(\w+)"', body).group(1)),
params['jschl_answer'] = JavaScriptInterpreter.dynamicImport(self.interpreter).solveChallenge(body, domain) ('pass', re.search(r'name="pass" value="(.+?)"', body).group(1)),
('jschl_answer', JavaScriptInterpreter.dynamicImport(self.interpreter).solveChallenge(body, domain))
]
)
except Exception as e:
raise ValueError('Unable to parse Cloudflare anti-bots page: {} {}'.format(e.message, BUG_REPORT))
# Requests transforms any request into a GET after a redirect, # Requests transforms any request into a GET after a redirect,
# so the redirect has to be handled manually here to allow for # so the redirect has to be handled manually here to allow for
# performing other types of requests even as the first request. # performing other types of requests even as the first request.
cloudflare_kwargs = deepcopy(original_kwargs)
cloudflare_kwargs.setdefault('params', params)
cloudflare_kwargs['allow_redirects'] = False cloudflare_kwargs['allow_redirects'] = False
self.request(resp.request.method, submit_url, **cloudflare_kwargs)
redirect = self.request(resp.request.method, submit_url, **cloudflare_kwargs) return self.request(resp.request.method, resp.url, **original_kwargs)
redirect_location = urlparse(redirect.headers['Location'])
if not redirect_location.netloc:
redirect_url = urlunparse(
(
parsed_url.scheme,
domain,
redirect_location.path,
redirect_location.params,
redirect_location.query,
redirect_location.fragment
)
)
return self.request(resp.request.method, redirect_url, **original_kwargs)
return self.request(resp.request.method, redirect.headers['Location'], **original_kwargs)
########################################################################################################################################################## ##########################################################################################################################################################
@@ -268,6 +277,7 @@ class CloudScraper(Session):
delay=kwargs.pop('delay', None), delay=kwargs.pop('delay', None),
interpreter=kwargs.pop('interpreter', 'js2py'), interpreter=kwargs.pop('interpreter', 'js2py'),
allow_brotli=kwargs.pop('allow_brotli', True), allow_brotli=kwargs.pop('allow_brotli', True),
recaptcha=kwargs.pop('recaptcha', {})
) )
try: try:
+1 -1
View File
@@ -19,7 +19,7 @@ class ChallengeInterpreter(JavaScriptInterpreter):
def eval(self, jsEnv, js):
    """Evaluate JavaScript with the V8 engine.

    Args:
        jsEnv: JavaScript source that is placed in front of ``js``.
        js: JavaScript source appended to ``jsEnv``; the two strings are
            concatenated and evaluated as one script.

    Returns:
        Whatever ``v8eval.V8().eval`` returns for the combined script.

    Raises:
        RuntimeError: If evaluating the script fails for any reason.
    """
    try:
        return v8eval.V8().eval('{}{}'.format(jsEnv, js))
    except Exception:
        # The exception used to be instantiated without ``raise``, so a
        # failed evaluation fell through and returned None to the caller.
        raise RuntimeError('We encountered an error running the V8 Engine.')
+48
View File
@@ -0,0 +1,48 @@
import re
import sys
import logging
import abc
# ``abc.ABC`` only exists on Python 3.4+; build an equivalent base otherwise.
if sys.version_info >= (3, 4):
    ABC = abc.ABC  # noqa
else:
    ABC = abc.ABCMeta('ABC', (), {})

##########################################################################################################################################################

BUG_REPORT = 'Cloudflare may have changed their technique, or there may be a bug in the script.'

##########################################################################################################################################################

# Registry of instantiated providers, keyed by the name passed to
# ``reCaptcha.__init__``.
captchaSolvers = {}


class reCaptcha(ABC):
    """Abstract base class for anti-reCaptcha providers.

    A provider subclasses this class, implements ``getCaptchaAnswer`` and
    calls ``__init__`` with its name, which stores the instance in
    ``captchaSolvers``.
    """

    @abc.abstractmethod
    def __init__(self, name):
        """Register this instance in ``captchaSolvers`` under ``name``."""
        captchaSolvers[name] = self

    @classmethod
    def dynamicImport(cls, name):
        """Return the provider registered as ``name``, importing it if needed.

        When ``name`` is not registered yet, the submodule
        ``<cls.__module__>.<name>`` is imported; that import is expected to
        instantiate the provider so that it registers itself.

        Raises:
            ImportError: If the submodule cannot be imported or did not
                register a ``reCaptcha`` instance under ``name``.
        """
        if name not in captchaSolvers:
            try:
                __import__('{}.{}'.format(cls.__module__, name))
                if not isinstance(captchaSolvers.get(name), reCaptcha):
                    raise ImportError('The anti reCaptcha provider was not initialized.')
            except ImportError:
                logging.error("Unable to load {} anti reCaptcha provider".format(name))
                raise

        return captchaSolvers[name]

    @abc.abstractmethod
    def getCaptchaAnswer(self, site_url, site_key, reCaptchaParams):
        """Return the provider's answer for the given site URL and site key."""
        pass

    def solveCaptcha(self, ret, reCaptchaParams):
        """Extract the reCaptcha site key from a response and solve it.

        Args:
            ret: Response-like object; its ``text`` is searched for the
                ``data-sitekey`` attribute and its ``url`` is forwarded.
            reCaptchaParams: Provider parameters, passed through unchanged
                to ``getCaptchaAnswer``.

        Returns:
            The value returned by ``getCaptchaAnswer``.

        Raises:
            ValueError: If no ``data-sitekey`` value can be read from ``ret``.
        """
        try:
            site_key = re.search('data-sitekey="(.+?)"', ret.text).group(1)
        except Exception as e:
            # Exceptions have no ``.message`` attribute on Python 3, so format
            # the exception itself; the text is the same on Python 2.
            raise ValueError("Unable to parse Cloudflare\'s reCaptcha variable 'data-sitekey': {} {}".format(e, BUG_REPORT))

        return self.getCaptchaAnswer(ret.url, site_key, reCaptchaParams)
+42
View File
@@ -0,0 +1,42 @@
from __future__ import absolute_import
import sys
try:
from python_anticaptcha import AnticaptchaClient, NoCaptchaTaskProxylessTask, NoCaptchaTask, Proxy
except ImportError:
sys.tracebacklimit = 0
raise RuntimeError("Please install the python module 'python_anticaptcha' via pip or download it https://github.com/ad-m/python-anticaptcha")
from . import reCaptcha
class captchaSolver(reCaptcha):
    """reCaptcha provider backed by the ``python_anticaptcha`` client."""

    def __init__(self):
        # The base initializer receives the provider name 'anticaptcha'.
        super(captchaSolver, self).__init__('anticaptcha')

    def getCaptchaAnswer(self, site_url, site_key, reCaptchaParams):
        """Create an anti-captcha task for the site and return its solution.

        Args:
            site_url: URL of the page showing the reCaptcha.
            site_key: The page's reCaptcha site key.
            reCaptchaParams: Mapping read for ``api_key`` (required) and,
                optionally, ``proxy`` and ``proxies``.

        Returns:
            The value of ``job.get_solution_response()`` once the job has
            been joined.

        Raises:
            ValueError: If ``api_key`` is missing or empty.
        """
        api_key = reCaptchaParams.get('api_key')
        if not api_key:
            raise ValueError("reCaptcha provider 'anticaptcha' was not provided an 'api_key' parameter.")

        client = AnticaptchaClient(api_key)

        # A proxied task is used only when 'proxy' is truthy and a
        # 'proxies' value is present; otherwise the proxyless task is used.
        proxies = reCaptchaParams.get('proxies')
        if reCaptchaParams.get('proxy', False) and proxies:
            client.session.proxies = proxies
            proxy = Proxy.parse_url(proxies.get('https'))
            task = NoCaptchaTask(site_url, site_key, proxy=proxy)
        else:
            task = NoCaptchaTaskProxylessTask(site_url, site_key)

        job = client.createTask(task)
        job.join()
        return job.get_solution_response()


# Instantiate once at import time; the initializer is given the name
# 'anticaptcha'.
captchaSolver()
@@ -0,0 +1,198 @@
from __future__ import absolute_import
import json
import requests
try:
import polling
except ImportError:
import sys
sys.tracebacklimit = 0
raise RuntimeError("Please install the python module 'polling' via pip or download it from https://github.com/justiniso/polling/")
from . import reCaptcha
class captchaSolver(reCaptcha):
    """reCaptcha provider that talks to the DeathByCaptcha HTTP API.

    Every request is retried through ``polling.poll`` every 10 seconds until
    its ``check_success`` callback returns a truthy value or the timeout
    expires.
    """

    def __init__(self):
        super(captchaSolver, self).__init__('deathbycaptcha')
        # NOTE(review): username/password are posted to this plain-HTTP
        # endpoint; confirm whether an HTTPS endpoint can be used instead.
        self.host = 'http://api.dbcapi.me/api'
        self.session = requests.Session()

    ##########################################################################################################################################################

    def checkErrorStatus(self, response):
        """Raise ``RuntimeError`` for the status codes treated as fatal.

        Status codes not listed here (including 500, which was deliberately
        left disabled) return normally, so the surrounding poll keeps
        retrying.
        """
        errors = {
            400: "DeathByCaptcha: 400 Bad Request",
            403: "DeathByCaptcha: 403 Forbidden - Invalid credentails or insufficient credits.",
            503: "DeathByCaptcha: 503 Service Temporarily Unavailable."
        }

        if response.status_code in errors:
            raise RuntimeError(errors.get(response.status_code))

    ##########################################################################################################################################################

    def login(self, username, password):
        """Store the credentials and verify them against ``/user``.

        Raises:
            RuntimeError: If the account is banned, has a zero balance, or
                the API answers with a fatal status code.
            polling.TimeoutException: If no 200 response arrives within
                120 seconds.
        """
        self.username = username
        self.password = password

        def _checkRequest(response):
            if response.status_code == 200:
                account = response.json()
                if account.get('is_banned'):
                    raise RuntimeError('DeathByCaptcha: Your account is banned.')

                # The key was misspelled as 'balanace', which never matched,
                # so an empty account was never detected.
                if account.get('balance') == 0:
                    raise RuntimeError('DeathByCaptcha: insufficient credits.')

                return response

            self.checkErrorStatus(response)
            return None

        response = polling.poll(
            lambda: self.session.post(
                '{}/user'.format(self.host),
                headers={'Accept': 'application/json'},
                data={
                    'username': self.username,
                    'password': self.password
                }
            ),
            check_success=_checkRequest,
            step=10,
            timeout=120
        )

        # ``debugRequest`` is defined neither on this class nor on the
        # reCaptcha base class, so calling it unconditionally raised
        # AttributeError after a successful login. Call it only if present.
        debug_hook = getattr(self, 'debugRequest', None)
        if callable(debug_hook):
            debug_hook(response)

    ##########################################################################################################################################################

    def reportJob(self, jobID):
        """Report the job ``jobID`` as failed.

        Returns:
            True once the API has answered the report with status 200.

        Raises:
            RuntimeError: If ``jobID`` is falsy, the API answers with a fatal
                status code, or the poll yields a falsy response.
            polling.TimeoutException: If no 200 response arrives within
                180 seconds.
        """
        if not jobID:
            raise RuntimeError("DeathByCaptcha: Error bad job id to report failed reCaptcha.")

        def _checkRequest(response):
            if response.status_code == 200:
                return response

            self.checkErrorStatus(response)
            return None

        response = polling.poll(
            lambda: self.session.post(
                '{}/captcha/{}/report'.format(self.host, jobID),
                headers={'Accept': 'application/json'},
                data={
                    'username': self.username,
                    'password': self.password
                }
            ),
            check_success=_checkRequest,
            step=10,
            timeout=180
        )

        if response:
            return True
        else:
            raise RuntimeError("DeathByCaptcha: Error report failed reCaptcha.")

    ##########################################################################################################################################################

    def requestJob(self, jobID):
        """Poll the job ``jobID`` until its ``text`` field is filled in.

        Returns:
            The ``text`` value of the job's JSON response.

        Raises:
            RuntimeError: If ``jobID`` is falsy, the API answers with a fatal
                status code, or the poll yields a falsy response.
            polling.TimeoutException: If no answer arrives within 180 seconds.
        """
        if not jobID:
            raise RuntimeError("DeathByCaptcha: Error bad job id to request reCaptcha.")

        def _checkRequest(response):
            if response.status_code in [200, 303] and response.json().get('text'):
                return response

            self.checkErrorStatus(response)
            return None

        response = polling.poll(
            lambda: self.session.get(
                '{}/captcha/{}'.format(self.host, jobID),
                headers={'Accept': 'application/json'}
            ),
            check_success=_checkRequest,
            step=10,
            timeout=180
        )

        if response:
            return response.json().get('text')
        else:
            raise RuntimeError("DeathByCaptcha: Error failed to solve reCaptcha.")

    ##########################################################################################################################################################

    def requestSolve(self, site_url, site_key):
        """Submit a token job (``type`` '4') for the given site.

        Returns:
            The ``captcha`` value of the JSON response, which the caller uses
            as the job id.

        Raises:
            RuntimeError: If the API answers with a fatal status code or the
                poll yields a falsy response.
            polling.TimeoutException: If no accepted job arrives within
                180 seconds.
        """
        def _checkRequest(response):
            if response.status_code in [200, 303]:
                job = response.json()
                if job.get("is_correct") and job.get('captcha'):
                    return response

            self.checkErrorStatus(response)
            return None

        response = polling.poll(
            lambda: self.session.post(
                '{}/captcha'.format(self.host),
                headers={'Accept': 'application/json'},
                data={
                    'username': self.username,
                    'password': self.password,
                    'type': '4',
                    'token_params': json.dumps({
                        'googlekey': site_key,
                        'pageurl': site_url
                    })
                },
                # Redirects are not followed, so a 303 reaches _checkRequest.
                allow_redirects=False
            ),
            check_success=_checkRequest,
            step=10,
            timeout=180
        )

        if response:
            return response.json().get('captcha')
        else:
            raise RuntimeError('DeathByCaptcha: Error no job id was returned.')

    ##########################################################################################################################################################

    def getCaptchaAnswer(self, site_url, site_key, reCaptchaParams):
        """Solve the reCaptcha for ``site_url`` and return the answer.

        Args:
            site_url: URL of the page showing the reCaptcha.
            site_key: The page's reCaptcha site key.
            reCaptchaParams: Mapping that must contain ``username`` and
                ``password``; when ``proxy`` is truthy, ``proxies`` is
                installed on the HTTP session.

        Returns:
            The value returned by ``requestJob`` for the submitted job.

        Raises:
            ValueError: If ``username`` or ``password`` is missing or empty.
            RuntimeError: If solving times out, or one of the API calls fails.
        """
        jobID = None

        for param in ['username', 'password']:
            value = reCaptchaParams.get(param)
            if not value:
                raise ValueError("DeathByCaptcha: Missing '{}' parameter.".format(param))
            setattr(self, param, value)

        if reCaptchaParams.get('proxy'):
            self.session.proxies = reCaptchaParams.get('proxies')

        try:
            jobID = self.requestSolve(site_url, site_key)
            return self.requestJob(jobID)
        except polling.TimeoutException:
            # Report the job only if one was actually created before the
            # timeout.
            try:
                if jobID:
                    self.reportJob(jobID)
            except polling.TimeoutException:
                raise RuntimeError("DeathByCaptcha: reCaptcha solve took to long and also failed reporting the job.")

            raise RuntimeError("DeathByCaptcha: reCaptcha solve took to long to execute, aborting.")


# Instantiate once at import time; the initializer is given the name
# 'deathbycaptcha'.
captchaSolver()
+12 -5
View File
@@ -19,21 +19,28 @@ class User_Agent():
########################################################################################################################################################## ##########################################################################################################################################################
def loadUserAgent(self, *args, **kwargs): def loadUserAgent(self, *args, **kwargs):
browser = kwargs.pop('browser', 'chrome') browser = kwargs.pop('browser', None)
user_agents = json.load( user_agents = json.load(
open(os.path.join(os.path.dirname(__file__), 'browsers.json'), 'r'), open(os.path.join(os.path.dirname(__file__), 'browsers.json'), 'r'),
object_pairs_hook=OrderedDict object_pairs_hook=OrderedDict
) )
if not user_agents.get(browser): if browser and not user_agents.get(browser):
logging.error('Sorry "{}" browser User-Agent was not found.'.format(browser)) logging.error('Sorry "{}" browser User-Agent was not found.'.format(browser))
raise raise
user_agent_version = random.SystemRandom().choice(list(user_agents.get(browser))) if not browser:
browser = random.SystemRandom().choice(list(user_agents))
self.headers = user_agents.get(browser).get(user_agent_version).get('headers') user_agent_version = random.SystemRandom().choice(list(user_agents.get(browser).get('releases')))
self.headers['User-Agent'] = random.SystemRandom().choice(user_agents.get(browser).get(user_agent_version).get('User-Agent'))
if user_agents.get(browser).get('releases').get(user_agent_version).get('headers'):
self.headers = user_agents.get(browser).get('releases').get(user_agent_version).get('headers')
else:
self.headers = user_agents.get(browser).get('default_headers')
self.headers['User-Agent'] = random.SystemRandom().choice(user_agents.get(browser).get('releases').get(user_agent_version).get('User-Agent'))
if not kwargs.get('allow_brotli', False): if not kwargs.get('allow_brotli', False):
if 'br' in self.headers['Accept-Encoding']: if 'br' in self.headers['Accept-Encoding']:
File diff suppressed because it is too large Load Diff