diff --git a/lib/cloudscraper/__init__.py b/lib/cloudscraper/__init__.py index eb957a63..d10d2f73 100644 --- a/lib/cloudscraper/__init__.py +++ b/lib/cloudscraper/__init__.py @@ -1,8 +1,26 @@ +# ------------------------------------------------------------------------------- # + import logging import re +import requests import sys import ssl -import requests + +from collections import OrderedDict +from copy import deepcopy + +from requests.adapters import HTTPAdapter +from requests.sessions import Session +from requests_toolbelt.utils import dump + +from time import sleep + +# ------------------------------------------------------------------------------- # + +try: + import brotli +except ImportError: + pass try: import copyreg @@ -17,12 +35,12 @@ except ImportError: else: from html.parser import HTMLParser -from copy import deepcopy -from time import sleep -from collections import OrderedDict +try: + from urlparse import urlparse, urljoin +except ImportError: + from urllib.parse import urlparse, urljoin -from requests.sessions import Session -from requests.adapters import HTTPAdapter +# ------------------------------------------------------------------------------- # from .exceptions import ( CloudflareLoopProtection, @@ -37,25 +55,9 @@ from .interpreters import JavaScriptInterpreter from .reCaptcha import reCaptcha from .user_agent import User_Agent -try: - from requests_toolbelt.utils import dump -except ImportError: - pass - -try: - import brotli -except ImportError: - pass - -try: - from urlparse import urlparse, urljoin -except ImportError: - from urllib.parse import urlparse, urljoin - - # ------------------------------------------------------------------------------- # -__version__ = '1.2.36' +__version__ = '1.2.40' # ------------------------------------------------------------------------------- # @@ -107,6 +109,9 @@ class CloudScraper(Session): self.ssl_context = kwargs.pop('ssl_context', None) self.interpreter = kwargs.pop('interpreter', 'native') self.recaptcha = kwargs.pop('recaptcha', {}) + self.requestPreHook = kwargs.pop('requestPreHook', None) + self.requestPostHook = kwargs.pop('requestPostHook', None) + self.allow_brotli = kwargs.pop( 'allow_brotli', True if 'brotli' in sys.modules.keys() else False @@ -213,19 +218,46 @@ class CloudScraper(Session): if kwargs.get('proxies') and kwargs.get('proxies') != self.proxies: self.proxies = kwargs.get('proxies') - resp = self.decodeBrotli( + # ------------------------------------------------------------------------------- # + # Pre-Hook the request via user defined function. + # ------------------------------------------------------------------------------- # + + if self.requestPreHook: + (method, url, args, kwargs) = self.requestPreHook( + self, + method, + url, + *args, + **kwargs + ) + + # ------------------------------------------------------------------------------- # + # Make the request via requests. + # ------------------------------------------------------------------------------- # + + response = self.decodeBrotli( super(CloudScraper, self).request(method, url, *args, **kwargs) ) # ------------------------------------------------------------------------------- # - # Debug request + # Debug the request via the Response object. # ------------------------------------------------------------------------------- # if self.debug: - self.debugRequest(resp) + self.debugRequest(response) + + # ------------------------------------------------------------------------------- # + # Post-Hook the request aka Post-Hook the response via user defined function. + # ------------------------------------------------------------------------------- # + + if self.requestPostHook: + response = self.requestPostHook(self, response) + + if self.debug: + self.debugRequest(response) # Check if Cloudflare anti-bot is on - if self.is_Challenge_Request(resp): + if self.is_Challenge_Request(response): # ------------------------------------------------------------------------------- # # Try to solve the challenge and send it back # ------------------------------------------------------------------------------- # @@ -239,12 +271,12 @@ class CloudScraper(Session): self._solveDepthCnt += 1 - resp = self.Challenge_Response(resp, **kwargs) + response = self.Challenge_Response(response, **kwargs) else: - if not resp.is_redirect and resp.status_code not in [429, 503]: + if not response.is_redirect and response.status_code not in [429, 503]: self._solveDepthCnt = 0 - return resp + return response # ------------------------------------------------------------------------------- # # check if the response contains a valid Cloudflare challenge @@ -259,7 +291,7 @@ class CloudScraper(Session): and re.search( r'
', formPayload['form']): + for challengeParam in re.findall(r'^\s*', formPayload['form'], re.M | re.S): inputPayload = dict(re.findall(r'(\S+)="(\S+)"', challengeParam)) if inputPayload.get('name') in ['r', 'jschl_vc', 'pass']: payload.update({inputPayload['name']: inputPayload['value']}) diff --git a/lib/cloudscraper/interpreters/__init__.py b/lib/cloudscraper/interpreters/__init__.py index 10955552..af937b4a 100644 --- a/lib/cloudscraper/interpreters/__init__.py +++ b/lib/cloudscraper/interpreters/__init__.py @@ -49,7 +49,7 @@ class JavaScriptInterpreter(ABC): def solveChallenge(self, body, domain): try: - return float(self.eval(body, domain)) + return '{0:.10f}'.format(float(self.eval(body, domain))) except Exception: raise CloudflareSolveError( 'Error trying to solve Cloudflare IUAM Javascript, they may have changed their technique.' diff --git a/lib/cloudscraper/interpreters/encapsulated.py b/lib/cloudscraper/interpreters/encapsulated.py index 98faf48f..d98fa236 100644 --- a/lib/cloudscraper/interpreters/encapsulated.py +++ b/lib/cloudscraper/interpreters/encapsulated.py @@ -9,32 +9,38 @@ def template(body, domain): try: js = re.search( - r'setTimeout\(function\(\){\s+(.*?a\.value = \S+)', + r'setTimeout\(function\(\){\s+(.*?a\.value\s*=\s*\S+toFixed\(10\);)', body, re.M | re.S ).group(1) except Exception: raise ValueError('Unable to identify Cloudflare IUAM Javascript on website. {}'.format(BUG_REPORT)) - jsEnv = ''' - String.prototype.italics=function(str) {{return "" + this + "";}}; + jsEnv = '''String.prototype.italics=function(str) {{return "" + this + "";}}; + var subVars= {{{subVars}}}; var document = {{ createElement: function () {{ return {{ firstChild: {{ href: "https://{domain}/" }} }} }}, - getElementById: function () {{ - return {{"innerHTML": "{innerHTML}"}}; + getElementById: function (str) {{ + return {{"innerHTML": subVars[str]}}; }} }}; ''' try: - innerHTML = re.search( - r']*)? id="([^<>]*?)">([^<>]*?)', - body, - re.MULTILINE | re.DOTALL + js = js.replace( + r"(setInterval(function(){}, 100),t.match(/https?:\/\//)[0]);", + r"t.match(/https?:\/\//)[0];" ) - innerHTML = innerHTML.group(2) if innerHTML else '' + + k = re.search(r" k\s*=\s*'(?P\S+)';", body).group('k') + r = re.compile(r'
\s*(?P[^<>]*)
'.format(k)) + + subVars = '' + for m in r.finditer(body): + subVars = '{}\n\t\t{}{}: {},\n'.format(subVars, k, m.group('id'), m.group('jsfuck')) + subVars = subVars[:-2] except: # noqa logging.error('Error extracting Cloudflare IUAM Javascript. {}'.format(BUG_REPORT)) @@ -46,7 +52,7 @@ def template(body, domain): ' ', jsEnv.format( domain=domain, - innerHTML=innerHTML + subVars=subVars ), re.MULTILINE | re.DOTALL ), diff --git a/lib/cloudscraper/interpreters/native.py b/lib/cloudscraper/interpreters/native.py index 94d238bb..f71474cf 100644 --- a/lib/cloudscraper/interpreters/native.py +++ b/lib/cloudscraper/interpreters/native.py @@ -100,8 +100,8 @@ class ChallengeInterpreter(JavaScriptInterpreter): # ------------------------------------------------------------------------------- # - def flatten(l): - return sum(map(flatten, l), []) if isinstance(l, list) else [l] + def flatten(lists): + return sum(map(flatten, lists), []) if isinstance(lists, list) else [lists] # ------------------------------------------------------------------------------- # @@ -114,6 +114,7 @@ class ChallengeInterpreter(JavaScriptInterpreter): # Hackery Parser for Math stack = [] bstack = [] + for i in flatten(pyparsing.nestedExpr().parseString(jsFuck).asList()): if i == '+': stack.append(bstack) @@ -152,13 +153,35 @@ class ChallengeInterpreter(JavaScriptInterpreter): try: jsfuckChallenge = re.search( r"setTimeout\(function\(\){\s+var.*?f,\s*(?P\w+).*?:(?P\S+)};" - r".*?\('challenge-form'\);\s+;(?P.*?a\.value)" - r"(?:.*id=\"cf-dn-.*?>(?P\S+)<)?", + r".*?\('challenge-form'\);.*?;(?P.*?a\.value)\s*=\s*\S+\.toFixed\(10\);", body, re.DOTALL | re.MULTILINE ).groupdict() except AttributeError: - raise CloudflareSolveError('There was an issue extracting the Cloudflare challenge.') + raise CloudflareSolveError('There was an issue extracting "jsfuckChallenge" from the Cloudflare challenge.') + + kJSFUCK = re.search(r'(;|)\s*k.=(?P\S+);', jsfuckChallenge['challenge'], re.S | re.M) + if kJSFUCK: + try: + kJSFUCK = jsfuckToNumber(kJSFUCK.group('kJSFUCK')) + except IndexError: + raise CloudflareSolveError('There was an issue extracting "kJSFUCK" from the Cloudflare challenge.') + + try: + kID = re.search(r"\s*k\s*=\s*'(?P\S+)';", body).group('kID') + except IndexError: + raise CloudflareSolveError('There was an issue extracting "kID" from the Cloudflare challenge.') + + try: + r = re.compile(r'
\s*(?P[^<>]*)
'.format(kID)) + + kValues = {} + for m in r.finditer(body): + kValues[int(m.group('id'))] = m.group('jsfuck') + + jsfuckChallenge['k'] = kValues[kJSFUCK] + except (AttributeError, IndexError): + raise CloudflareSolveError('There was an issue extracting "kValues" from the Cloudflare challenge.') jsfuckChallenge['challenge'] = re.finditer( r'{}.*?([+\-*/])=(.*?);(?=a\.value|{})'.format( @@ -193,8 +216,8 @@ class ChallengeInterpreter(JavaScriptInterpreter): # ------------------------------------------------------------------------------- # - if not jsfuckChallenge['k'] and '+ t.length' in body: - jschl_answer += len(domain) + # if not jsfuckChallenge['k'] and '+ t.length' in body: + # jschl_answer += len(domain) # ------------------------------------------------------------------------------- #