From d73034b348656050afd7387bf508bc89d03fee79 Mon Sep 17 00:00:00 2001 From: Alhaziel Date: Sat, 30 Nov 2019 17:56:42 +0100 Subject: [PATCH] Fix Parziale Cloudscraper --- lib/cloudscraper/__init__.py | 202 +++++++++++++++++++++---- lib/cloudscraper/interpreters/js2py.py | 9 +- 2 files changed, 175 insertions(+), 36 deletions(-) diff --git a/lib/cloudscraper/__init__.py b/lib/cloudscraper/__init__.py index 4debbe0b..bde3a74e 100644 --- a/lib/cloudscraper/__init__.py +++ b/lib/cloudscraper/__init__.py @@ -178,6 +178,26 @@ class CloudScraper(Session): return resp ########################################################################################################################################################## + # ------------------------------------------------------------------------------- # + # check if the response contains a valid Cloudflare reCaptcha challenge + # ------------------------------------------------------------------------------- # + + @staticmethod + def is_reCaptcha_Challenge(resp): + try: + return ( + resp.headers.get('Server', '').startswith('cloudflare') + and resp.status_code == 403 + and re.search( + r'action="/.*?__cf_chl_captcha_tk__=\S+".*?data\-sitekey=.*?', + resp.text, + re.M | re.DOTALL + ) + ) + except AttributeError: + pass + + return False @staticmethod def isChallengeRequest(resp): @@ -193,61 +213,177 @@ class CloudScraper(Session): return False + + # ------------------------------------------------------------------------------- # + # Try to solve cloudflare javascript challenge. + # ------------------------------------------------------------------------------- # + + @staticmethod + def IUAM_Challenge_Response(body, domain, interpreter): + try: + challengeUUID = re.search( + r'__cf_chl_jschl_tk__=(?P\S+)"', + body, re.M | re.DOTALL + ).groupdict().get('challengeUUID') + params = OrderedDict(re.findall(r'name="(r|jschl_vc|pass)"\svalue="(.*?)"', body)) + except AttributeError: + sys.tracebacklimit = 0 + raise RuntimeError("Cloudflare IUAM detected, unfortunately we can't extract the parameters correctly.") + + try: + params['jschl_answer'] = JavaScriptInterpreter.dynamicImport( + interpreter + ).solveChallenge(body, domain) + except Exception as e: + raise RuntimeError( + 'Unable to parse Cloudflare anti-bots page: {}'.format( + getattr(e, 'message', e) + ) + ) + + return { + 'url': 'https://{}/'.format(domain), + 'params': {'__cf_chl_jschl_tk__': challengeUUID}, + 'data': params + } + + @staticmethod + def reCaptcha_Challenge_Response(provider, provider_params, body, url): + try: + params = re.search( + r'(name="r"\svalue="(?P\S+)"|).*?__cf_chl_captcha_tk__=(?P\S+)".*?' + r'data-ray="(?P\S+)".*?data-sitekey="(?P\S+)"', + body, re.M | re.DOTALL + ).groupdict() + except (AttributeError): + sys.tracebacklimit = 0 + raise RuntimeError( + "Cloudflare reCaptcha detected, unfortunately we can't extract the parameters correctly." + ) + + return { + 'url': url, + 'params': {'__cf_chl_captcha_tk__': params.get('challengeUUID')}, + 'data': OrderedDict([ + ('r', ''), + ('id', params.get('data_ray')), + ( + 'g-recaptcha-response', + reCaptcha.dynamicImport( + provider.lower() + ).solveCaptcha(url, params.get('site_key'), provider_params) + ) + ]) + } + ########################################################################################################################################################## def sendChallengeResponse(self, resp, **original_kwargs): - body = resp.text + if self.is_reCaptcha_Challenge(resp): + # ------------------------------------------------------------------------------- # + # double down on the request as some websites are only checking + # if cfuid is populated before issuing reCaptcha. + # ------------------------------------------------------------------------------- # - parsed_url = urlparse(resp.url) - domain = parsed_url.netloc + resp = self.decodeBrotli( + super(CloudScraper, self).request(resp.request.method, resp.url, **kwargs) + ) - params = OrderedDict() + if not self.is_reCaptcha_Challenge(resp): + return resp - s = re.search(r'name="s"\svalue="(?P[^"]+)', body) - if s: - params['s'] = s.group('s_value') + # ------------------------------------------------------------------------------- # + # if no reCaptcha provider raise a runtime error. + # ------------------------------------------------------------------------------- # - if b'/cdn-cgi/l/chk_captcha' in resp.content: if not self.recaptcha or not isinstance(self.recaptcha, dict) or not self.recaptcha.get('provider'): sys.tracebacklimit = 0 - raise RuntimeError("Cloudflare reCaptcha detected, unfortunately you haven't loaded an anti reCaptcha provider correctly via the 'recaptcha' parameter.") + raise RuntimeError( + "Cloudflare reCaptcha detected, unfortunately you haven't loaded an anti reCaptcha provider " + "correctly via the 'recaptcha' parameter." + ) + + # ------------------------------------------------------------------------------- # + # if provider is return_response, return the response without doing anything. + # ------------------------------------------------------------------------------- # + + if self.recaptcha.get('provider') == 'return_response': + return resp - submit_url = '{}://{}/cdn-cgi/l/chk_captcha'.format(parsed_url.scheme, domain) self.recaptcha['proxies'] = self.proxies - params['g-recaptcha-response'] = reCaptcha.dynamicImport(self.recaptcha.get('provider').lower()).solveCaptcha(resp, self.recaptcha) + submit_url = self.reCaptcha_Challenge_Response( + self.recaptcha.get('provider'), + self.recaptcha, + resp.text, + resp.url + ) else: + # ------------------------------------------------------------------------------- # # Cloudflare requires a delay before solving the challenge + # ------------------------------------------------------------------------------- # + if not self.delay: try: - delay = float(re.search(r'submit\(\);\r?\n\s*},\s*([0-9]+)', body).group(1)) / float(1000) + delay = float( + re.search( + r'submit\(\);\r?\n\s*},\s*([0-9]+)', + resp.text + ).group(1) + ) / float(1000) if isinstance(delay, (int, float)): self.delay = delay - except: # noqa - pass + except (AttributeError, ValueError): + sys.tracebacklimit = 0 + raise RuntimeError("Cloudflare IUAM possibility malformed, issue extracing delay value.") sleep(self.delay) - submit_url = '{}://{}/cdn-cgi/l/chk_jschl'.format(parsed_url.scheme, domain) - try: - params.update( - [ - ('jschl_vc', re.search(r'name="jschl_vc" value="(\w+)"', body).group(1)), - ('pass', re.search(r'name="pass" value="(.+?)"', body).group(1)), - ('jschl_answer', JavaScriptInterpreter.dynamicImport(self.interpreter).solveChallenge(body, domain)) - ] - ) - except Exception as e: - raise ValueError('Unable to parse Cloudflare anti-bots page: {} {}'.format(e.message, BUG_REPORT)) - # Requests transforms any request into a GET after a redirect, - # so the redirect has to be handled manually here to allow for - # performing other types of requests even as the first request. + # ------------------------------------------------------------------------------- # - cloudflare_kwargs = deepcopy(original_kwargs) - cloudflare_kwargs.setdefault('params', params) - cloudflare_kwargs['allow_redirects'] = False - self.request(resp.request.method, submit_url, **cloudflare_kwargs) + submit_url = self.IUAM_Challenge_Response( + resp.text, + urlparse(resp.url).netloc, + self.interpreter + ) - return self.request(resp.request.method, resp.url, **original_kwargs) + # ------------------------------------------------------------------------------- # + # Send the Challenge Response back to Cloudflare + # ------------------------------------------------------------------------------- # + + if submit_url: + def updateAttr(obj, name, newValue): + try: + obj[name].update(newValue) + return obj[name] + except (AttributeError, KeyError): + obj[name] = {} + obj[name].update(newValue) + return obj[name] + + cloudflare_kwargs = deepcopy(kwargs) + cloudflare_kwargs['allow_redirects'] = False + cloudflare_kwargs['params'] = updateAttr(cloudflare_kwargs, 'params', submit_url['params']) + cloudflare_kwargs['data'] = updateAttr(cloudflare_kwargs, 'data', submit_url['data']) + cloudflare_kwargs['headers'] = updateAttr(cloudflare_kwargs, 'headers', {'Referer': resp.url}) + + self.request( + 'POST', + submit_url['url'], + **cloudflare_kwargs + ) + + # ------------------------------------------------------------------------------- # + # Request the original query request and return it + # ------------------------------------------------------------------------------- # + + return self.request(resp.request.method, resp.url, **kwargs) + + # ------------------------------------------------------------------------------- # + # Request the original query request and return it + # ------------------------------------------------------------------------------- # + + return self.request(resp.request.method, resp.url, **kwargs) + # ------------------------------------------------------------------------------- # ########################################################################################################################################################## diff --git a/lib/cloudscraper/interpreters/js2py.py b/lib/cloudscraper/interpreters/js2py.py index d3910461..57610b7a 100644 --- a/lib/cloudscraper/interpreters/js2py.py +++ b/lib/cloudscraper/interpreters/js2py.py @@ -15,9 +15,12 @@ class ChallengeInterpreter(JavaScriptInterpreter): super(ChallengeInterpreter, self).__init__('js2py') def eval(self, jsEnv, js): - if js2py.eval_js('(+(+!+[]+[+!+[]]+(!![]+[])[!+[]+!+[]+!+[]]+[!+[]+!+[]]+[+[]])+[])[+!+[]]') == '1': - logging.warning('WARNING - Please upgrade your js2py https://github.com/PiotrDabkowski/Js2Py, applying work around for the meantime.') - js = jsunfuck(js) + ### blocca lo script + + # from core.support import dbg; dbg() + # if js2py.eval_js('(+(+!+[]+[+!+[]]+(!![]+[])[!+[]+!+[]+!+[]]+[!+[]+!+[]]+[+[]])+[])[+!+[]]') == '1': + # logging.warning('WARNING - Please upgrade your js2py https://github.com/PiotrDabkowski/Js2Py, applying work around for the meantime.') + # js = jsunfuck(js) def atob(s): return base64.b64decode('{}'.format(s)).decode('utf-8')