Aggiornato cloudscraper

This commit is contained in:
Alhaziel01
2020-05-27 11:33:16 +02:00
parent 6e5f3389e2
commit 5f914c191c
4 changed files with 114 additions and 53 deletions

View File

@@ -1,8 +1,26 @@
# ------------------------------------------------------------------------------- #
import logging
import re
import requests
import sys
import ssl
import requests
from collections import OrderedDict
from copy import deepcopy
from requests.adapters import HTTPAdapter
from requests.sessions import Session
from requests_toolbelt.utils import dump
from time import sleep
# ------------------------------------------------------------------------------- #
try:
import brotli
except ImportError:
pass
try:
import copyreg
@@ -17,12 +35,12 @@ except ImportError:
else:
from html.parser import HTMLParser
from copy import deepcopy
from time import sleep
from collections import OrderedDict
try:
from urlparse import urlparse, urljoin
except ImportError:
from urllib.parse import urlparse, urljoin
from requests.sessions import Session
from requests.adapters import HTTPAdapter
# ------------------------------------------------------------------------------- #
from .exceptions import (
CloudflareLoopProtection,
@@ -37,25 +55,9 @@ from .interpreters import JavaScriptInterpreter
from .reCaptcha import reCaptcha
from .user_agent import User_Agent
try:
from requests_toolbelt.utils import dump
except ImportError:
pass
try:
import brotli
except ImportError:
pass
try:
from urlparse import urlparse, urljoin
except ImportError:
from urllib.parse import urlparse, urljoin
# ------------------------------------------------------------------------------- #
__version__ = '1.2.36'
__version__ = '1.2.40'
# ------------------------------------------------------------------------------- #
@@ -107,6 +109,9 @@ class CloudScraper(Session):
self.ssl_context = kwargs.pop('ssl_context', None)
self.interpreter = kwargs.pop('interpreter', 'native')
self.recaptcha = kwargs.pop('recaptcha', {})
self.requestPreHook = kwargs.pop('requestPreHook', None)
self.requestPostHook = kwargs.pop('requestPostHook', None)
self.allow_brotli = kwargs.pop(
'allow_brotli',
True if 'brotli' in sys.modules.keys() else False
@@ -213,19 +218,46 @@ class CloudScraper(Session):
if kwargs.get('proxies') and kwargs.get('proxies') != self.proxies:
self.proxies = kwargs.get('proxies')
resp = self.decodeBrotli(
# ------------------------------------------------------------------------------- #
# Pre-hook the request via a user-defined function.
# ------------------------------------------------------------------------------- #
if self.requestPreHook:
(method, url, args, kwargs) = self.requestPreHook(
self,
method,
url,
*args,
**kwargs
)
# ------------------------------------------------------------------------------- #
# Make the request via requests.
# ------------------------------------------------------------------------------- #
response = self.decodeBrotli(
super(CloudScraper, self).request(method, url, *args, **kwargs)
)
# ------------------------------------------------------------------------------- #
# Debug request
# Debug the request via the Response object.
# ------------------------------------------------------------------------------- #
if self.debug:
self.debugRequest(resp)
self.debugRequest(response)
# ------------------------------------------------------------------------------- #
# Post-hook the response via a user-defined function.
# ------------------------------------------------------------------------------- #
if self.requestPostHook:
response = self.requestPostHook(self, response)
if self.debug:
self.debugRequest(response)
# Check if Cloudflare anti-bot is on
if self.is_Challenge_Request(resp):
if self.is_Challenge_Request(response):
# ------------------------------------------------------------------------------- #
# Try to solve the challenge and send it back
# ------------------------------------------------------------------------------- #
@@ -239,12 +271,12 @@ class CloudScraper(Session):
self._solveDepthCnt += 1
resp = self.Challenge_Response(resp, **kwargs)
response = self.Challenge_Response(response, **kwargs)
else:
if not resp.is_redirect and resp.status_code not in [429, 503]:
if not response.is_redirect and response.status_code not in [429, 503]:
self._solveDepthCnt = 0
return resp
return response
# ------------------------------------------------------------------------------- #
# check if the response contains a valid Cloudflare challenge
@@ -259,7 +291,7 @@ class CloudScraper(Session):
and re.search(
r'<form .*?="challenge-form" action="/.*?__cf_chl_jschl_tk__=\S+"',
resp.text,
re.M | re.DOTALL
re.M | re.S
)
)
except AttributeError:
@@ -278,9 +310,9 @@ class CloudScraper(Session):
resp.headers.get('Server', '').startswith('cloudflare')
and resp.status_code in [429, 503]
and re.search(
r'cpo.src="/cdn-cgi/challenge-platform/orchestrate/jsch/v1"',
r'cpo.src\s*=\s*"/cdn-cgi/challenge-platform/orchestrate/jsch/v1"',
resp.text,
re.M | re.DOTALL
re.M | re.S
)
)
except AttributeError:
@@ -375,7 +407,7 @@ class CloudScraper(Session):
)
payload = OrderedDict()
for challengeParam in re.findall(r'<input\s(.*?)>', formPayload['form']):
for challengeParam in re.findall(r'^\s*<input\s(.*?)/>', formPayload['form'], re.M | re.S):
inputPayload = dict(re.findall(r'(\S+)="(\S+)"', challengeParam))
if inputPayload.get('name') in ['r', 'jschl_vc', 'pass']:
payload.update({inputPayload['name']: inputPayload['value']})

View File

@@ -49,7 +49,7 @@ class JavaScriptInterpreter(ABC):
def solveChallenge(self, body, domain):
try:
return float(self.eval(body, domain))
return '{0:.10f}'.format(float(self.eval(body, domain)))
except Exception:
raise CloudflareSolveError(
'Error trying to solve Cloudflare IUAM Javascript, they may have changed their technique.'

View File

@@ -9,32 +9,38 @@ def template(body, domain):
try:
js = re.search(
r'setTimeout\(function\(\){\s+(.*?a\.value = \S+)',
r'setTimeout\(function\(\){\s+(.*?a\.value\s*=\s*\S+toFixed\(10\);)',
body,
re.M | re.S
).group(1)
except Exception:
raise ValueError('Unable to identify Cloudflare IUAM Javascript on website. {}'.format(BUG_REPORT))
jsEnv = '''
String.prototype.italics=function(str) {{return "<i>" + this + "</i>";}};
jsEnv = '''String.prototype.italics=function(str) {{return "<i>" + this + "</i>";}};
var subVars= {{{subVars}}};
var document = {{
createElement: function () {{
return {{ firstChild: {{ href: "https://{domain}/" }} }}
}},
getElementById: function () {{
return {{"innerHTML": "{innerHTML}"}};
getElementById: function (str) {{
return {{"innerHTML": subVars[str]}};
}}
}};
'''
try:
innerHTML = re.search(
r'<div(?: [^<>]*)? id="([^<>]*?)">([^<>]*?)</div>',
body,
re.MULTILINE | re.DOTALL
js = js.replace(
r"(setInterval(function(){}, 100),t.match(/https?:\/\//)[0]);",
r"t.match(/https?:\/\//)[0];"
)
innerHTML = innerHTML.group(2) if innerHTML else ''
k = re.search(r" k\s*=\s*'(?P<k>\S+)';", body).group('k')
r = re.compile(r'<div id="{}(?P<id>\d+)">\s*(?P<jsfuck>[^<>]*)</div>'.format(k))
subVars = ''
for m in r.finditer(body):
subVars = '{}\n\t\t{}{}: {},\n'.format(subVars, k, m.group('id'), m.group('jsfuck'))
subVars = subVars[:-2]
except: # noqa
logging.error('Error extracting Cloudflare IUAM Javascript. {}'.format(BUG_REPORT))
@@ -46,7 +52,7 @@ def template(body, domain):
' ',
jsEnv.format(
domain=domain,
innerHTML=innerHTML
subVars=subVars
),
re.MULTILINE | re.DOTALL
),

View File

@@ -100,8 +100,8 @@ class ChallengeInterpreter(JavaScriptInterpreter):
# ------------------------------------------------------------------------------- #
def flatten(l):
return sum(map(flatten, l), []) if isinstance(l, list) else [l]
def flatten(lists):
    """Return the leaves of an arbitrarily nested list as one flat list.

    Used on the nested-list output of the pyparsing expression parser so the
    tokens can be walked in a single pass.

    Args:
        lists: A (possibly nested) ``list``, or any other single value.

    Returns:
        A new flat ``list`` with the leaves in depth-first, left-to-right
        order. A value that is not a ``list`` is returned wrapped in a
        one-element list; an empty list yields an empty list.
    """
    # Only real lists are descended into; strings, tuples, numbers, etc. are
    # treated as leaves.
    if not isinstance(lists, list):
        return [lists]

    # Grow one accumulator in place instead of ``sum(..., [])``, which
    # allocates a brand-new list for every element it concatenates.
    flat = []
    for item in lists:
        flat.extend(flatten(item))
    return flat
# ------------------------------------------------------------------------------- #
@@ -114,6 +114,7 @@ class ChallengeInterpreter(JavaScriptInterpreter):
# Hackery Parser for Math
stack = []
bstack = []
for i in flatten(pyparsing.nestedExpr().parseString(jsFuck).asList()):
if i == '+':
stack.append(bstack)
@@ -152,13 +153,35 @@ class ChallengeInterpreter(JavaScriptInterpreter):
try:
jsfuckChallenge = re.search(
r"setTimeout\(function\(\){\s+var.*?f,\s*(?P<variable>\w+).*?:(?P<init>\S+)};"
r".*?\('challenge-form'\);\s+;(?P<challenge>.*?a\.value)"
r"(?:.*id=\"cf-dn-.*?>(?P<k>\S+)<)?",
r".*?\('challenge-form'\);.*?;(?P<challenge>.*?a\.value)\s*=\s*\S+\.toFixed\(10\);",
body,
re.DOTALL | re.MULTILINE
).groupdict()
except AttributeError:
raise CloudflareSolveError('There was an issue extracting the Cloudflare challenge.')
raise CloudflareSolveError('There was an issue extracting "jsfuckChallenge" from the Cloudflare challenge.')
kJSFUCK = re.search(r'(;|)\s*k.=(?P<kJSFUCK>\S+);', jsfuckChallenge['challenge'], re.S | re.M)
if kJSFUCK:
try:
kJSFUCK = jsfuckToNumber(kJSFUCK.group('kJSFUCK'))
except IndexError:
raise CloudflareSolveError('There was an issue extracting "kJSFUCK" from the Cloudflare challenge.')
try:
kID = re.search(r"\s*k\s*=\s*'(?P<kID>\S+)';", body).group('kID')
except IndexError:
raise CloudflareSolveError('There was an issue extracting "kID" from the Cloudflare challenge.')
try:
r = re.compile(r'<div id="{}(?P<id>\d+)">\s*(?P<jsfuck>[^<>]*)</div>'.format(kID))
kValues = {}
for m in r.finditer(body):
kValues[int(m.group('id'))] = m.group('jsfuck')
jsfuckChallenge['k'] = kValues[kJSFUCK]
except (AttributeError, IndexError):
raise CloudflareSolveError('There was an issue extracting "kValues" from the Cloudflare challenge.')
jsfuckChallenge['challenge'] = re.finditer(
r'{}.*?([+\-*/])=(.*?);(?=a\.value|{})'.format(
@@ -193,8 +216,8 @@ class ChallengeInterpreter(JavaScriptInterpreter):
# ------------------------------------------------------------------------------- #
if not jsfuckChallenge['k'] and '+ t.length' in body:
jschl_answer += len(domain)
# if not jsfuckChallenge['k'] and '+ t.length' in body:
# jschl_answer += len(domain)
# ------------------------------------------------------------------------------- #