Aggiornato cloudscraper

2020-04-05 10:45:28 +02:00
parent 34be96127b
commit 55c6ac7c8f
7 changed files with 178 additions and 81 deletions
--- a/lib/cloudscraper/__init__.py
+++ b/lib/cloudscraper/__init__.py
@@ -54,7 +54,7 @@ except ImportError:

 # ------------------------------------------------------------------------------- #

-__version__ = '1.2.30'
+__version__ = '1.2.32'

 # ------------------------------------------------------------------------------- #

@@ -148,6 +148,15 @@ class CloudScraper(Session):
    def __getstate__(self):
        return self.__dict__

+    # ------------------------------------------------------------------------------- #
+    # Raise an Exception with no stacktrace and reset depth counter.
+    # ------------------------------------------------------------------------------- #
+
+    def simpleException(self, exception, msg):
+        self._solveDepthCnt = 0
+        sys.tracebacklimit = 0
+        raise exception(msg)
+
    # ------------------------------------------------------------------------------- #
    # debug the request via the response
    # ------------------------------------------------------------------------------- #
@@ -219,9 +228,8 @@ class CloudScraper(Session):

            if self._solveDepthCnt >= self.solveDepth:
                _ = self._solveDepthCnt
-                self._solveDepthCnt = 0
-                sys.tracebacklimit = 0
-                raise CloudflareLoopProtection(
+                self.simpleException(
+                    CloudflareLoopProtection,
                    "!!Loop Protection!! We have tried to solve {} time(s) in a row.".format(_)
                )

@@ -303,8 +311,10 @@ class CloudScraper(Session):

    def is_Challenge_Request(self, resp):
        if self.is_Firewall_Blocked(resp):
-            sys.tracebacklimit = 0
-            raise CloudflareCode1020('Cloudflare has blocked this request (Code 1020 Detected).')
+            self.simpleException(
+                CloudflareCode1020,
+                'Cloudflare has blocked this request (Code 1020 Detected).'
+            )

        if self.is_reCaptcha_Challenge(resp) or self.is_IUAM_Challenge(resp):
            return True
@@ -317,16 +327,29 @@ class CloudScraper(Session):

    def IUAM_Challenge_Response(self, body, url, interpreter):
        try:
-            challengeUUID = re.search(
-                r'id="challenge-form" action="(?P<challengeUUID>\S+)"',
-                body, re.M | re.DOTALL
-            ).groupdict().get('challengeUUID', '')
+            formPayload = re.search(
+                r'<form (?P<form>id="challenge-form" action="(?P<challengeUUID>.*?'
+                r'__cf_chl_jschl_tk__=\S+)"(.*?)</form>)',
+                body,
+                re.M | re.DOTALL
+            ).groupdict()

-            payload = OrderedDict(re.findall(r'name="(r|jschl_vc|pass)"\svalue="(.*?)"', body))
+            if not all(key in formPayload for key in ['form', 'challengeUUID']):
+                self.simpleException(
+                    CloudflareIUAMError,
+                    "Cloudflare IUAM detected, unfortunately we can't extract the parameters correctly."
+                )
+
+            payload = OrderedDict(
+                re.findall(
+                    r'name="(r|jschl_vc|pass)"\svalue="(.*?)"',
+                    formPayload['form']
+                )
+            )

        except AttributeError:
-            sys.tracebacklimit = 0
-            raise CloudflareIUAMError(
+            self.simpleException(
+                CloudflareIUAMError,
                "Cloudflare IUAM detected, unfortunately we can't extract the parameters correctly."
            )

@@ -337,8 +360,8 @@ class CloudScraper(Session):
                interpreter
            ).solveChallenge(body, hostParsed.netloc)
        except Exception as e:
-            sys.tracebacklimit = 0
-            raise CloudflareIUAMError(
+            self.simpleException(
+                CloudflareIUAMError,
                'Unable to parse Cloudflare anti-bots page: {}'.format(
                    getattr(e, 'message', e)
                )
@@ -348,7 +371,7 @@ class CloudScraper(Session):
            'url': '{}://{}{}'.format(
                hostParsed.scheme,
                hostParsed.netloc,
-                self.unescape(challengeUUID)
+                self.unescape(formPayload['challengeUUID'])
            ),
            'data': payload
        }
@@ -359,34 +382,62 @@ class CloudScraper(Session):

    def reCaptcha_Challenge_Response(self, provider, provider_params, body, url):
        try:
-            payload = re.search(
-                r'(name="r"\svalue="(?P<r>\S+)"|).*?challenge-form" action="(?P<challengeUUID>\S+)".*?'
-                r'data-ray="(?P<data_ray>\S+)".*?data-sitekey="(?P<site_key>\S+)"',
-                body, re.M | re.DOTALL
+            formPayload = re.search(
+                r'<form class="challenge-form" (?P<form>id="challenge-form" '
+                r'action="(?P<challengeUUID>.*?__cf_chl_captcha_tk__=\S+)"(.*?)</form>)',
+                body,
+                re.M | re.DOTALL
            ).groupdict()
-        except (AttributeError):
-            sys.tracebacklimit = 0
-            raise CloudflareReCaptchaError(
+
+            if not all(key in formPayload for key in ['form', 'challengeUUID']):
+                self.simpleException(
+                    CloudflareReCaptchaError,
+                    "Cloudflare reCaptcha detected, unfortunately we can't extract the parameters correctly."
+                )
+
+            payload = OrderedDict(
+                re.findall(
+                    r'(name="r"\svalue|data-ray|data-sitekey|name="cf_captcha_kind"\svalue)="(.*?)"',
+                    formPayload['form']
+                )
+            )
+
+            captchaType = 'reCaptcha' if payload['name="cf_captcha_kind" value'] == 're' else 'hCaptcha'
+
+        except (AttributeError, KeyError):
+            self.simpleException(
+                CloudflareReCaptchaError,
                "Cloudflare reCaptcha detected, unfortunately we can't extract the parameters correctly."
            )

+        captchaResponse = reCaptcha.dynamicImport(
+            provider.lower()
+        ).solveCaptcha(
+            captchaType,
+            url,
+            payload['data-sitekey'],
+            provider_params
+        )
+
+        dataPayload = OrderedDict([
+            ('r', payload.get('name="r" value', '')),
+            ('cf_captcha_kind', payload['name="cf_captcha_kind" value']),
+            ('id', payload.get('data-ray')),
+            ('g-recaptcha-response', captchaResponse)
+        ])
+
+        if captchaType == 'hCaptcha':
+            dataPayload.update({'h-captcha-response': captchaResponse})
+
        hostParsed = urlparse(url)
+
        return {
            'url': '{}://{}{}'.format(
                hostParsed.scheme,
                hostParsed.netloc,
-                self.unescape(payload.get('challengeUUID', ''))
+                self.unescape(formPayload['challengeUUID'])
            ),
-            'data': OrderedDict([
-                ('r', payload.get('r', '')),
-                ('id', payload.get('data_ray')),
-                (
-                    'g-recaptcha-response',
-                    reCaptcha.dynamicImport(
-                        provider.lower()
-                    ).solveCaptcha(url, payload.get('site_key'), provider_params)
-                )
-            ])
+            'data': dataPayload
        }

    # ------------------------------------------------------------------------------- #
@@ -412,8 +463,8 @@ class CloudScraper(Session):
            # ------------------------------------------------------------------------------- #

            if not self.recaptcha or not isinstance(self.recaptcha, dict) or not self.recaptcha.get('provider'):
-                sys.tracebacklimit = 0
-                raise CloudflareReCaptchaProvider(
+                self.simpleException(
+                    CloudflareReCaptchaProvider,
                    "Cloudflare reCaptcha detected, unfortunately you haven't loaded an anti reCaptcha provider "
                    "correctly via the 'recaptcha' parameter."
                )
@@ -448,8 +499,10 @@ class CloudScraper(Session):
                    if isinstance(delay, (int, float)):
                        self.delay = delay
                except (AttributeError, ValueError):
-                    sys.tracebacklimit = 0
-                    raise CloudflareIUAMError("Cloudflare IUAM possibility malformed, issue extracing delay value.")
+                    self.simpleException(
+                        CloudflareIUAMError,
+                        "Cloudflare IUAM possibility malformed, issue extracing delay value."
+                    )

            sleep(self.delay)

@@ -507,6 +560,7 @@ class CloudScraper(Session):

            if not challengeSubmitResponse.is_redirect:
                return challengeSubmitResponse
+
            else:
                cloudflare_kwargs = deepcopy(kwargs)
                cloudflare_kwargs['headers'] = updateAttr(
@@ -535,6 +589,7 @@ class CloudScraper(Session):
        # ------------------------------------------------------------------------------- #

        return self.request(resp.request.method, resp.url, **kwargs)
+
    # ------------------------------------------------------------------------------- #

    @classmethod
@@ -587,8 +642,8 @@ class CloudScraper(Session):
                cookie_domain = d
                break
        else:
-            sys.tracebacklimit = 0
-            raise CloudflareIUAMError(
+            cls.simpleException(
+                CloudflareIUAMError,
                "Unable to find Cloudflare cookies. Does the site actually "
                "have Cloudflare IUAM (I'm Under Attack Mode) enabled?"
            )
--- a/lib/cloudscraper/reCaptcha/2captcha.py
+++ b/lib/cloudscraper/reCaptcha/2captcha.py
@@ -2,7 +2,6 @@ from __future__ import absolute_import

 import requests

-
 from ..exceptions import (
    reCaptchaServiceUnavailable,
    reCaptchaAPIError,
@@ -81,7 +80,7 @@ class captchaSolver(reCaptcha):
            }
        }

-        if response.json().get('status') is False and response.json().get('request') in errors.get(request_type):
+        if response.json().get('status') == 0 and response.json().get('request') in errors.get(request_type):
            raise reCaptchaAPIError(
                '{} {}'.format(
                    response.json().get('request'),
@@ -113,7 +112,8 @@ class captchaSolver(reCaptcha):
                    'action': 'reportbad',
                    'id': jobID,
                    'json': '1'
-                }
+                },
+                timeout=30
            ),
            check_success=_checkRequest,
            step=5,
@@ -149,7 +149,8 @@ class captchaSolver(reCaptcha):
                    'action': 'get',
                    'id': jobID,
                    'json': '1'
-                }
+                },
+                timeout=30
            ),
            check_success=_checkRequest,
            step=5,
@@ -165,7 +166,7 @@ class captchaSolver(reCaptcha):

    # ------------------------------------------------------------------------------- #

-    def requestSolve(self, site_url, site_key):
+    def requestSolve(self, captchaType, url, siteKey):
        def _checkRequest(response):
            if response.ok and response.json().get("status") == 1 and response.json().get('request'):
                return response
@@ -174,18 +175,29 @@ class captchaSolver(reCaptcha):

            return None

+        data = {
+            'key': self.api_key,
+            'pageurl': url,
+            'json': 1,
+            'soft_id': 5507698
+        }
+
+        data.update(
+            {
+                'method': 'userrcaptcha',
+                'googlekey': siteKey
+            } if captchaType == 'reCaptcha' else {
+                'method': 'hcaptcha',
+                'sitekey': siteKey
+            }
+        )
+
        response = polling.poll(
            lambda: self.session.post(
                '{}/in.php'.format(self.host),
-                data={
-                    'key': self.api_key,
-                    'method': 'userrecaptcha',
-                    'googlekey': site_key,
-                    'pageurl': site_url,
-                    'json': '1',
-                    'soft_id': '5507698'
-                },
-                allow_redirects=False
+                data=data,
+                allow_redirects=False,
+                timeout=30
            ),
            check_success=_checkRequest,
            step=5,
@@ -201,7 +213,7 @@ class captchaSolver(reCaptcha):

    # ------------------------------------------------------------------------------- #

-    def getCaptchaAnswer(self, site_url, site_key, reCaptchaParams):
+    def getCaptchaAnswer(self, captchaType, url, siteKey, reCaptchaParams):
        jobID = None

        if not reCaptchaParams.get('api_key'):
@@ -215,7 +227,7 @@ class captchaSolver(reCaptcha):
            self.session.proxies = reCaptchaParams.get('proxies')

        try:
-            jobID = self.requestSolve(site_url, site_key)
+            jobID = self.requestSolve(captchaType, url, siteKey)
            return self.requestJob(jobID)
        except polling.TimeoutException:
            try:
--- a/lib/cloudscraper/reCaptcha/9kw.py
+++ b/lib/cloudscraper/reCaptcha/9kw.py
@@ -12,6 +12,7 @@ except ImportError:
    )

 from ..exceptions import (
+    reCaptchaException,
    reCaptchaServiceUnavailable,
    reCaptchaAPIError,
    reCaptchaTimeout,
@@ -143,7 +144,7 @@ class captchaSolver(reCaptcha):

    # ------------------------------------------------------------------------------- #

-    def requestSolve(self, site_url, site_key):
+    def requestSolve(self, url, siteKey):
        def _checkRequest(response):
            if response.ok and response.text.startswith('{') and response.json().get('captchaid'):
                return response
@@ -159,9 +160,9 @@ class captchaSolver(reCaptcha):
                    'apikey': self.api_key,
                    'action': 'usercaptchaupload',
                    'interactive': 1,
-                    'file-upload-01': site_key,
+                    'file-upload-01': siteKey,
                    'oldsource': 'recaptchav2',
-                    'pageurl': site_url,
+                    'pageurl': url,
                    'maxtimeout': self.maxtimeout,
                    'json': 1
                },
@@ -179,12 +180,17 @@ class captchaSolver(reCaptcha):

    # ------------------------------------------------------------------------------- #

-    def getCaptchaAnswer(self, site_url, site_key, reCaptchaParams):
+    def getCaptchaAnswer(self, captchaType, url, siteKey, reCaptchaParams):
        jobID = None

        if not reCaptchaParams.get('api_key'):
            raise reCaptchaParameter("9kw: Missing api_key parameter.")

+        if captchaType == 'hCaptcha':
+            raise reCaptchaException(
+                'Provider does not support hCaptcha.'
+            )
+
        self.api_key = reCaptchaParams.get('api_key')

        if reCaptchaParams.get('maxtimeout'):
@@ -194,7 +200,7 @@ class captchaSolver(reCaptcha):
            self.session.proxies = reCaptchaParams.get('proxies')

        try:
-            jobID = self.requestSolve(site_url, site_key)
+            jobID = self.requestSolve(url, siteKey)
            return self.requestJob(jobID)
        except polling.TimeoutException:
            raise reCaptchaTimeout(
--- a/lib/cloudscraper/reCaptcha/__init__.py
+++ b/lib/cloudscraper/reCaptcha/__init__.py
@@ -37,10 +37,10 @@ class reCaptcha(ABC):
    # ------------------------------------------------------------------------------- #

    @abc.abstractmethod
-    def getCaptchaAnswer(self, site_url, site_key, reCaptchaParams):
+    def getCaptchaAnswer(self, captchaType, url, siteKey, reCaptchaParams):
        pass

    # ------------------------------------------------------------------------------- #

-    def solveCaptcha(self, site_url, site_key, reCaptchaParams):
-        return self.getCaptchaAnswer(site_url, site_key, reCaptchaParams)
+    def solveCaptcha(self, captchaType, url, siteKey, reCaptchaParams):
+        return self.getCaptchaAnswer(captchaType, url, siteKey, reCaptchaParams)
--- a/lib/cloudscraper/reCaptcha/anticaptcha.py
+++ b/lib/cloudscraper/reCaptcha/anticaptcha.py
@@ -1,16 +1,22 @@
 from __future__ import absolute_import

-from ..exceptions import reCaptchaParameter
+from ..exceptions import (
+    reCaptchaParameter,
+    reCaptchaTimeout,
+    reCaptchaAPIError
+)

 try:
    from python_anticaptcha import (
        AnticaptchaClient,
-        NoCaptchaTaskProxylessTask
+        NoCaptchaTaskProxylessTask,
+        HCaptchaTaskProxyless,
+        AnticaptchaException
    )
 except ImportError:
    raise ImportError(
-        "Please install the python module 'python_anticaptcha' via pip or download it from "
-        "https://github.com/ad-m/python-anticaptcha"
+        "Please install/upgrade the python module 'python_anticaptcha' via "
+        "pip install python-anticaptcha or https://github.com/ad-m/python-anticaptcha/"
    )

 from . import reCaptcha
@@ -23,7 +29,7 @@ class captchaSolver(reCaptcha):

    # ------------------------------------------------------------------------------- #

-    def getCaptchaAnswer(self, site_url, site_key, reCaptchaParams):
+    def getCaptchaAnswer(self, captchaType, url, siteKey, reCaptchaParams):
        if not reCaptchaParams.get('api_key'):
            raise reCaptchaParameter("anticaptcha: Missing api_key parameter.")

@@ -32,16 +38,30 @@ class captchaSolver(reCaptcha):
        if reCaptchaParams.get('proxy'):
            client.session.proxies = reCaptchaParams.get('proxies')

-        task = NoCaptchaTaskProxylessTask(site_url, site_key)
+        captchaMap = {
+            'reCaptcha': NoCaptchaTaskProxylessTask,
+            'hCaptcha': HCaptchaTaskProxyless
+        }
+
+        task = captchaMap[captchaType](url, siteKey)

        if not hasattr(client, 'createTaskSmee'):
            raise NotImplementedError(
                "Please upgrade 'python_anticaptcha' via pip or download it from "
-                "https://github.com/ad-m/python-anticaptcha"
+                "https://github.com/ad-m/python-anticaptcha/tree/hcaptcha"
            )

        job = client.createTaskSmee(task)
-        return job.get_solution_response()
+
+        try:
+            job.join(maximum_time=180)
+        except (AnticaptchaException) as e:
+            raise reCaptchaTimeout('{}'.format(getattr(e, 'message', e)))
+
+        if 'solution' in job._last_result:
+            return job.get_solution_response()
+        else:
+            raise reCaptchaAPIError('Job did not return `solution` key in payload.')


 # ------------------------------------------------------------------------------- #
--- a/lib/cloudscraper/reCaptcha/deathbycaptcha.py
+++ b/lib/cloudscraper/reCaptcha/deathbycaptcha.py
@@ -12,6 +12,7 @@ except ImportError:
    )

 from ..exceptions import (
+    reCaptchaException,
    reCaptchaServiceUnavailable,
    reCaptchaAccountError,
    reCaptchaTimeout,
@@ -154,7 +155,7 @@ class captchaSolver(reCaptcha):

    # ------------------------------------------------------------------------------- #

-    def requestSolve(self, site_url, site_key):
+    def requestSolve(self, url, siteKey):
        def _checkRequest(response):
            if response.ok and response.json().get("is_correct") and response.json().get('captcha'):
                return response
@@ -172,8 +173,8 @@ class captchaSolver(reCaptcha):
                    'password': self.password,
                    'type': '4',
                    'token_params': json.dumps({
-                        'googlekey': site_key,
-                        'pageurl': site_url
+                        'googlekey': siteKey,
+                        'pageurl': url
                    })
                },
                allow_redirects=False
@@ -192,7 +193,7 @@ class captchaSolver(reCaptcha):

    # ------------------------------------------------------------------------------- #

-    def getCaptchaAnswer(self, site_url, site_key, reCaptchaParams):
+    def getCaptchaAnswer(self, captchaType, url, siteKey, reCaptchaParams):
        jobID = None

        for param in ['username', 'password']:
@@ -202,11 +203,16 @@ class captchaSolver(reCaptcha):
                )
            setattr(self, param, reCaptchaParams.get(param))

+        if captchaType == 'hCaptcha':
+            raise reCaptchaException(
+                'Provider does not support hCaptcha.'
+            )
+
        if reCaptchaParams.get('proxy'):
            self.session.proxies = reCaptchaParams.get('proxies')

        try:
-            jobID = self.requestSolve(site_url, site_key)
+            jobID = self.requestSolve(url, siteKey)
            return self.requestJob(jobID)
        except polling.TimeoutException:
            try:
--- a/lib/cloudscraper/user_agent/browsers.json
+++ b/lib/cloudscraper/user_agent/browsers.json
@@ -19,8 +19,7 @@
            "ECDHE-RSA-CHACHA20-POLY1305",
            "AES128-GCM-SHA256",
            "AES256-GCM-SHA384",
-            "AES128-SHA",
-            "AES256-SHA"
+            "AES128-SHA"
        ],
        "releases": {
            "Chrome/50.0.0.0": {
@@ -12825,8 +12824,7 @@
            "ECDHE-ECDSA-AES128-SHA",
            "DHE-RSA-AES128-SHA",
            "DHE-RSA-AES256-SHA",
-            "AES128-SHA",
-            "AES256-SHA"
+            "AES128-SHA"
        ],
        "releases": {
            "Firefox/50.0": {