Fix Parziale Cloudscraper

2019-11-30 17:56:42 +01:00
parent 092243fa99
commit d73034b348
2 changed files with 175 additions and 36 deletions
@@ -178,6 +178,26 @@ class CloudScraper(Session):
        return resp

    ##########################################################################################################################################################
+    # ------------------------------------------------------------------------------- #
+    # check if the response contains a valid Cloudflare reCaptcha challenge
+    # ------------------------------------------------------------------------------- #
+
+    @staticmethod
+    def is_reCaptcha_Challenge(resp):
+        try:
+            return (
+                resp.headers.get('Server', '').startswith('cloudflare')
+                and resp.status_code == 403
+                and re.search(
+                    r'action="/.*?__cf_chl_captcha_tk__=\S+".*?data\-sitekey=.*?',
+                    resp.text,
+                    re.M | re.DOTALL
+                )
+            )
+        except AttributeError:
+            pass
+
+        return False

    @staticmethod
    def isChallengeRequest(resp):
@@ -193,61 +213,177 @@ class CloudScraper(Session):

        return False

+
+    # ------------------------------------------------------------------------------- #
+    # Try to solve cloudflare javascript challenge.
+    # ------------------------------------------------------------------------------- #
+
+    @staticmethod
+    def IUAM_Challenge_Response(body, domain, interpreter):
+        try:
+            challengeUUID = re.search(
+                r'__cf_chl_jschl_tk__=(?P<challengeUUID>\S+)"',
+                body, re.M | re.DOTALL
+            ).groupdict().get('challengeUUID')
+            params = OrderedDict(re.findall(r'name="(r|jschl_vc|pass)"\svalue="(.*?)"', body))
+        except AttributeError:
+            sys.tracebacklimit = 0
+            raise RuntimeError("Cloudflare IUAM detected, unfortunately we can't extract the parameters correctly.")
+
+        try:
+            params['jschl_answer'] = JavaScriptInterpreter.dynamicImport(
+                interpreter
+            ).solveChallenge(body, domain)
+        except Exception as e:
+            raise RuntimeError(
+                'Unable to parse Cloudflare anti-bots page: {}'.format(
+                    getattr(e, 'message', e)
+                )
+            )
+
+        return {
+            'url': 'https://{}/'.format(domain),
+            'params': {'__cf_chl_jschl_tk__': challengeUUID},
+            'data': params
+        }
+
+    @staticmethod
+    def reCaptcha_Challenge_Response(provider, provider_params, body, url):
+        try:
+            params = re.search(
+                r'(name="r"\svalue="(?P<r>\S+)"|).*?__cf_chl_captcha_tk__=(?P<challengeUUID>\S+)".*?'
+                r'data-ray="(?P<data_ray>\S+)".*?data-sitekey="(?P<site_key>\S+)"',
+                body, re.M | re.DOTALL
+            ).groupdict()
+        except (AttributeError):
+            sys.tracebacklimit = 0
+            raise RuntimeError(
+                "Cloudflare reCaptcha detected, unfortunately we can't extract the parameters correctly."
+            )
+
+        return {
+            'url': url,
+            'params': {'__cf_chl_captcha_tk__': params.get('challengeUUID')},
+            'data': OrderedDict([
+                ('r', ''),
+                ('id', params.get('data_ray')),
+                (
+                    'g-recaptcha-response',
+                    reCaptcha.dynamicImport(
+                        provider.lower()
+                    ).solveCaptcha(url, params.get('site_key'), provider_params)
+                )
+            ])
+        }
+
    ##########################################################################################################################################################

    def sendChallengeResponse(self, resp, **original_kwargs):
-        body = resp.text
+        if self.is_reCaptcha_Challenge(resp):
+            # ------------------------------------------------------------------------------- #
+            # double down on the request as some websites are only checking
+            # if cfuid is populated before issuing reCaptcha.
+            # ------------------------------------------------------------------------------- #

-        parsed_url = urlparse(resp.url)
-        domain = parsed_url.netloc
+            resp = self.decodeBrotli(
+                super(CloudScraper, self).request(resp.request.method, resp.url, **kwargs)
+            )

-        params = OrderedDict()
+            if not self.is_reCaptcha_Challenge(resp):
+                return resp

-        s = re.search(r'name="s"\svalue="(?P<s_value>[^"]+)', body)
-        if s:
-            params['s'] = s.group('s_value')
+            # ------------------------------------------------------------------------------- #
+            # if no reCaptcha provider raise a runtime error.
+            # ------------------------------------------------------------------------------- #

-        if b'/cdn-cgi/l/chk_captcha' in resp.content:
            if not self.recaptcha or not isinstance(self.recaptcha, dict) or not self.recaptcha.get('provider'):
                sys.tracebacklimit = 0
-                raise RuntimeError("Cloudflare reCaptcha detected, unfortunately you haven't loaded an anti reCaptcha provider correctly via the 'recaptcha' parameter.")
+                raise RuntimeError(
+                    "Cloudflare reCaptcha detected, unfortunately you haven't loaded an anti reCaptcha provider "
+                    "correctly via the 'recaptcha' parameter."
+                )
+
+            # ------------------------------------------------------------------------------- #
+            # if provider is return_response, return the response without doing anything.
+            # ------------------------------------------------------------------------------- #
+
+            if self.recaptcha.get('provider') == 'return_response':
+                return resp

-            submit_url = '{}://{}/cdn-cgi/l/chk_captcha'.format(parsed_url.scheme, domain)
            self.recaptcha['proxies'] = self.proxies
-            params['g-recaptcha-response'] = reCaptcha.dynamicImport(self.recaptcha.get('provider').lower()).solveCaptcha(resp, self.recaptcha)
+            submit_url = self.reCaptcha_Challenge_Response(
+                self.recaptcha.get('provider'),
+                self.recaptcha,
+                resp.text,
+                resp.url
+            )
        else:
+            # ------------------------------------------------------------------------------- #
            # Cloudflare requires a delay before solving the challenge
+            # ------------------------------------------------------------------------------- #
+
            if not self.delay:
                try:
-                    delay = float(re.search(r'submit\(\);\r?\n\s*},\s*([0-9]+)', body).group(1)) / float(1000)
+                    delay = float(
+                        re.search(
+                            r'submit\(\);\r?\n\s*},\s*([0-9]+)',
+                            resp.text
+                        ).group(1)
+                    ) / float(1000)
                    if isinstance(delay, (int, float)):
                        self.delay = delay
-                except:  # noqa
-                    pass
+                except (AttributeError, ValueError):
+                    sys.tracebacklimit = 0
+                    raise RuntimeError("Cloudflare IUAM possibility malformed, issue extracing delay value.")

            sleep(self.delay)
-            submit_url = '{}://{}/cdn-cgi/l/chk_jschl'.format(parsed_url.scheme, domain)
-            try:
-                params.update(
-                    [
-                        ('jschl_vc', re.search(r'name="jschl_vc" value="(\w+)"', body).group(1)),
-                        ('pass', re.search(r'name="pass" value="(.+?)"', body).group(1)),
-                        ('jschl_answer', JavaScriptInterpreter.dynamicImport(self.interpreter).solveChallenge(body, domain))
-                    ]
-                )
-            except Exception as e:
-                raise ValueError('Unable to parse Cloudflare anti-bots page: {} {}'.format(e.message, BUG_REPORT))

-        # Requests transforms any request into a GET after a redirect,
-        # so the redirect has to be handled manually here to allow for
-        # performing other types of requests even as the first request.
+            # ------------------------------------------------------------------------------- #

-        cloudflare_kwargs = deepcopy(original_kwargs)
-        cloudflare_kwargs.setdefault('params', params)
-        cloudflare_kwargs['allow_redirects'] = False
-        self.request(resp.request.method, submit_url, **cloudflare_kwargs)
+            submit_url = self.IUAM_Challenge_Response(
+                resp.text,
+                urlparse(resp.url).netloc,
+                self.interpreter
+            )

-        return self.request(resp.request.method, resp.url, **original_kwargs)
+        # ------------------------------------------------------------------------------- #
+        # Send the Challenge Response back to Cloudflare
+        # ------------------------------------------------------------------------------- #
+
+        if submit_url:
+            def updateAttr(obj, name, newValue):
+                try:
+                    obj[name].update(newValue)
+                    return obj[name]
+                except (AttributeError, KeyError):
+                    obj[name] = {}
+                    obj[name].update(newValue)
+                    return obj[name]
+
+            cloudflare_kwargs = deepcopy(kwargs)
+            cloudflare_kwargs['allow_redirects'] = False
+            cloudflare_kwargs['params'] = updateAttr(cloudflare_kwargs, 'params', submit_url['params'])
+            cloudflare_kwargs['data'] = updateAttr(cloudflare_kwargs, 'data', submit_url['data'])
+            cloudflare_kwargs['headers'] = updateAttr(cloudflare_kwargs, 'headers', {'Referer': resp.url})
+
+            self.request(
+                'POST',
+                submit_url['url'],
+                **cloudflare_kwargs
+            )
+
+        # ------------------------------------------------------------------------------- #
+        # Request the original query request and return it
+        # ------------------------------------------------------------------------------- #
+
+        return self.request(resp.request.method, resp.url, **kwargs)
+
+        # ------------------------------------------------------------------------------- #
+        # Request the original query request and return it
+        # ------------------------------------------------------------------------------- #
+
+        return self.request(resp.request.method, resp.url, **kwargs)
+    # ------------------------------------------------------------------------------- #

    ##########################################################################################################################################################

@@ -15,9 +15,12 @@ class ChallengeInterpreter(JavaScriptInterpreter):
        super(ChallengeInterpreter, self).__init__('js2py')

    def eval(self, jsEnv, js):
-        if js2py.eval_js('(+(+!+[]+[+!+[]]+(!![]+[])[!+[]+!+[]+!+[]]+[!+[]+!+[]]+[+[]])+[])[+!+[]]') == '1':
-            logging.warning('WARNING - Please upgrade your js2py https://github.com/PiotrDabkowski/Js2Py, applying work around for the meantime.')
-            js = jsunfuck(js)
+        ### blocca lo script
+
+        # from core.support import dbg; dbg()
+        # if js2py.eval_js('(+(+!+[]+[+!+[]]+(!![]+[])[!+[]+!+[]+!+[]]+[!+[]+!+[]]+[+[]])+[])[+!+[]]') == '1':
+        #     logging.warning('WARNING - Please upgrade your js2py https://github.com/PiotrDabkowski/Js2Py, applying work around for the meantime.')
+        #     js = jsunfuck(js)

        def atob(s):
            return base64.b64decode('{}'.format(s)).decode('utf-8')