Merge branch 'master' into master
This commit is contained in:
@@ -10,7 +10,6 @@ import urlparse
|
||||
|
||||
from platformcode import logger
|
||||
from decimal import Decimal
|
||||
from js2py.internals import seval
|
||||
|
||||
|
||||
class Cloudflare:
|
||||
@@ -47,25 +46,50 @@ class Cloudflare:
|
||||
logger.debug("Metodo #2 (headers): NO disponible")
|
||||
self.header_data = {}
|
||||
|
||||
|
||||
def solve_cf(self, body, domain):
    """Solve the Cloudflare JavaScript challenge contained in ``body``.

    The challenge script is cut out of the page, cleaned of the statements
    that cannot run outside a browser, wrapped in a small JavaScript
    environment (``t``, a base64-style decoder ``e``, ``italics`` and a stub
    ``document``) and evaluated with js2py.

    :param body: HTML of the challenge page.
    :param domain: value given to the script as ``t``; its length replaces
        ``t.length`` in the script.
    :return: the value the script computes, converted to ``float``.
    :raises AttributeError: if the ``setTimeout`` challenge script is not
        found in ``body`` (``re.search`` returns ``None``).
    """
    # Extract the challenge exactly once (the block previously did it twice).
    js = re.search(
        r"setTimeout\(function\(\){\s+(var s,t,o,p,b,r,e,a,k,i,n,g,f.+?\r?\n[\s\S]+?a\.value =.+?)\r?\n",
        body
    ).group(1)

    # Keep only the expression assigned to a.value, drop the script's own
    # "e = function(s) {...};" (jsEnv supplies one) and the lines that assign
    # to or call members of single-letter names.
    js = re.sub(r"a\.value = ((.+).toFixed\(10\))?", r"\1", js)
    js = re.sub(r'(e\s=\sfunction\(s\)\s{.*?};)', '', js, flags=re.DOTALL|re.MULTILINE)
    js = re.sub(r"\s{3,}[a-z](?: = |\.).+", "", js).replace("t.length", str(len(domain)))
    js = js.replace('; 121', '')
    js = re.sub(r"[\n\\']", "", js)

    # JavaScript prelude; doubled braces are literal braces for str.format().
    jsEnv = """
        var t = "{domain}";
        var g = String.fromCharCode;
        o = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=";
        e = function(s) {{
            s += "==".slice(2 - (s.length & 3));
            var bm, r = "", r1, r2, i = 0;
            for (; i < s.length;) {{
                bm = o.indexOf(s.charAt(i++)) << 18 | o.indexOf(s.charAt(i++)) << 12 | (r1 = o.indexOf(s.charAt(i++))) << 6 | (r2 = o.indexOf(s.charAt(i++)));
                r += r1 === 64 ? g(bm >> 16 & 255) : r2 === 64 ? g(bm >> 16 & 255, bm >> 8 & 255) : g(bm >> 16 & 255, bm >> 8 & 255, bm & 255);
            }}
            return r;
        }};
        function italics (str) {{ return '<i>' + this + '</i>'; }};
        var document = {{
            getElementById: function () {{
                return {{'innerHTML': '{innerHTML}'}};
            }}
        }};
        {js}
    """

    # Text of the first <div ... id="..."> element; the stub
    # document.getElementById() returns it as innerHTML.
    innerHTML = re.search(r'<div(?: [^<>]*)? id="([^<>]*?)">([^<>]*?)<\/div>', body, re.MULTILINE | re.DOTALL)
    innerHTML = innerHTML.group(2).replace("'", r"\'") if innerHTML else ""

    import base64
    import js2py
    from jsc import jsunc

    js = jsunc(jsEnv.format(domain=domain, innerHTML=innerHTML, js=js))

    def atob(s):
        # Python-side replacement for the browser's atob().
        return base64.b64decode('{}'.format(s)).decode('utf-8')

    # NOTE(review): the script comes from the remote page, so pyimport is
    # switched off before it is evaluated.
    js2py.disable_pyimport()
    context = js2py.EvalJs({'atob': atob})
    result = context.eval(js)
    return float(result)
|
||||
|
||||
|
||||
|
||||
@property
|
||||
def wait_time(self):
|
||||
|
||||
@@ -32,7 +32,6 @@ import urlparse
|
||||
from StringIO import StringIO
|
||||
from threading import Lock
|
||||
|
||||
from core.cloudflare import Cloudflare
|
||||
from platformcode import config, logger
|
||||
from platformcode.logger import WebErrorException
|
||||
|
||||
@@ -76,21 +75,21 @@ def get_url_headers(url):
|
||||
return url + "|" + "&".join(["%s=%s" % (h, headers[h]) for h in headers])
|
||||
|
||||
|
||||
def load_cookies(alfa_s=False):
    """Load the cookie jar from the cookie file, if that file exists.

    A file that exists but cannot be loaded is deleted.

    :param alfa_s: when true, the informational log messages are skipped.
    """
    cookies_lock.acquire()
    try:
        if os.path.isfile(ficherocookies):
            if not alfa_s:
                logger.info("Leyendo fichero cookies")
            try:
                cj.load(ficherocookies, ignore_discard=True)
            except Exception:
                if not alfa_s:
                    logger.info("El fichero de cookies existe pero es ilegible, se borra")
                os.remove(ficherocookies)
    finally:
        # Always release, even if removing the unreadable file fails.
        cookies_lock.release()
|
||||
|
||||
|
||||
def save_cookies(alfa_s=False):
    """Write the cookie jar to the cookie file.

    :param alfa_s: when true, the informational log message is skipped.
    """
    cookies_lock.acquire()
    try:
        if not alfa_s:
            logger.info("Guardando cookies...")
        cj.save(ficherocookies, ignore_discard=True)
    finally:
        # Always release, even if writing the file fails.
        cookies_lock.release()
|
||||
|
||||
@@ -99,7 +98,7 @@ load_cookies()
|
||||
|
||||
|
||||
def downloadpage(url, post=None, headers=None, timeout=None, follow_redirects=True, cookies=True, replace_headers=False,
|
||||
add_referer=False, only_headers=False, bypass_cloudflare=True, count_retries=0, count_retries_tot=5, random_headers=False, ignore_response_code=False, alfa_s=False, proxy=True, proxy_web=False, forced_proxy=None, proxy_retries=1):
|
||||
add_referer=False, only_headers=False, bypass_cloudflare=True, count_retries=0, count_retries_tot=5, random_headers=False, ignore_response_code=False, alfa_s=False, proxy=True, proxy_web=False, proxy_addr_forced=None,forced_proxy=None, proxy_retries=1):
|
||||
"""
|
||||
Abre una url y retorna los datos obtenidos
|
||||
|
||||
@@ -174,19 +173,28 @@ def downloadpage(url, post=None, headers=None, timeout=None, follow_redirects=Tr
|
||||
proxy_CF_addr = ''
|
||||
proxy_web_name = ''
|
||||
proxy_log = ''
|
||||
import proxytools
|
||||
|
||||
try:
|
||||
if (proxy or proxy_web) and (forced_proxy or proxytools.channel_proxy_list(url, forced_proxy=forced_proxy)):
|
||||
if (proxy or proxy_web) and (forced_proxy or proxy_addr_forced or channel_proxy_list(url, forced_proxy=forced_proxy)):
|
||||
import proxytools
|
||||
proxy_addr, proxy_CF_addr, proxy_web_name, proxy_log = proxytools.get_proxy_addr(url, post=post, forced_proxy=forced_proxy)
|
||||
if proxy_addr_forced and proxy_log:
|
||||
import scrapertools
|
||||
proxy_log = scrapertools.find_single_match(str(proxy_addr_forced), "{'http.*':\s*'(.*?)'}")
|
||||
|
||||
if proxy and proxy_addr:
|
||||
if proxy_addr_forced: proxy_addr = proxy_addr_forced
|
||||
handlers.append(urllib2.ProxyHandler(proxy_addr))
|
||||
proxy_stat = ', Proxy Direct ' + proxy_log
|
||||
elif proxy and proxy_CF_addr:
|
||||
if proxy_addr_forced: proxy_CF_addr = proxy_addr_forced
|
||||
handlers.append(urllib2.ProxyHandler(proxy_CF_addr))
|
||||
proxy_stat = ', Proxy CF ' + proxy_log
|
||||
elif proxy and not proxy_addr and not proxy_CF_addr:
|
||||
elif proxy and proxy_addr_forced:
|
||||
proxy_addr = proxy_addr_forced
|
||||
handlers.append(urllib2.ProxyHandler(proxy_addr))
|
||||
proxy_stat = ', Proxy Direct ' + proxy_log
|
||||
elif proxy and not proxy_addr and not proxy_CF_addr and not proxy_addr_forced:
|
||||
proxy = False
|
||||
if not proxy_web_name:
|
||||
proxy_addr, proxy_CF_addr, proxy_web_name, proxy_log = proxytools.get_proxy_addr(url, forced_proxy='Total')
|
||||
@@ -335,7 +343,7 @@ def downloadpage(url, post=None, headers=None, timeout=None, follow_redirects=Tr
|
||||
raise WebErrorException(urlparse.urlparse(url)[1])
|
||||
|
||||
if cookies:
|
||||
save_cookies()
|
||||
save_cookies(alfa_s=alfa_s)
|
||||
|
||||
if not alfa_s:
|
||||
logger.info("Encoding: %s" % (response["headers"].get('content-encoding')))
|
||||
@@ -362,6 +370,7 @@ def downloadpage(url, post=None, headers=None, timeout=None, follow_redirects=Tr
|
||||
|
||||
# Anti Cloudflare
|
||||
if bypass_cloudflare and count_retries < count_retries_tot:
|
||||
from core.cloudflare import Cloudflare
|
||||
cf = Cloudflare(response)
|
||||
if cf.is_cloudflare:
|
||||
count_retries += 1
|
||||
@@ -370,15 +379,15 @@ def downloadpage(url, post=None, headers=None, timeout=None, follow_redirects=Tr
|
||||
auth_url = cf.get_url()
|
||||
if not alfa_s:
|
||||
logger.info("Autorizando... intento %d url: %s" % (count_retries, auth_url))
|
||||
tt = downloadpage(auth_url, headers=request_headers, replace_headers=True, count_retries=count_retries, ignore_response_code=True, count_retries_tot=count_retries_tot, proxy=proxy, proxy_web=proxy_web)
|
||||
tt = downloadpage(auth_url, headers=request_headers, replace_headers=True, count_retries=count_retries, ignore_response_code=True, count_retries_tot=count_retries_tot, proxy=proxy, proxy_web=proxy_web, forced_proxy=forced_proxy, proxy_addr_forced=proxy_addr_forced, alfa_s=alfa_s)
|
||||
if tt.code == 403:
|
||||
tt = downloadpage(url, headers=request_headers, replace_headers=True, count_retries=count_retries, ignore_response_code=True, count_retries_tot=count_retries_tot, proxy=proxy, proxy_web=proxy_web)
|
||||
tt = downloadpage(url, headers=request_headers, replace_headers=True, count_retries=count_retries, ignore_response_code=True, count_retries_tot=count_retries_tot, proxy=proxy, proxy_web=proxy_web, forced_proxy=forced_proxy, proxy_addr_forced=proxy_addr_forced, alfa_s=alfa_s)
|
||||
if tt.sucess:
|
||||
if not alfa_s:
|
||||
logger.info("Autorización correcta, descargando página")
|
||||
resp = downloadpage(url=response["url"], post=post, headers=headers, timeout=timeout,
|
||||
follow_redirects=follow_redirects, count_retries=count_retries,
|
||||
cookies=cookies, replace_headers=replace_headers, add_referer=add_referer, proxy=proxy, proxy_web=proxy_web, count_retries_tot=count_retries_tot)
|
||||
cookies=cookies, replace_headers=replace_headers, add_referer=add_referer, proxy=proxy, proxy_web=proxy_web, count_retries_tot=count_retries_tot, forced_proxy=forced_proxy, proxy_addr_forced=proxy_addr_forced, alfa_s=alfa_s)
|
||||
response["sucess"] = resp.sucess
|
||||
response["code"] = resp.code
|
||||
response["error"] = resp.error
|
||||
@@ -435,6 +444,30 @@ def random_useragent():
|
||||
return UserAgentIem
|
||||
|
||||
return default_headers["User-Agent"]
|
||||
|
||||
|
||||
def channel_proxy_list(url, forced_proxy=None):
    """Tell whether ``url`` belongs to a host listed in the proxy settings.

    The ``proxy_channel_bloqued`` setting is base64-decoded and parsed with
    ``ast.literal_eval``; presumably it is a dict keyed by host name whose
    values contain ``'ON'`` when the proxy is active (verify against the
    code that writes the setting).

    :param url: address whose host part is looked up.
    :param forced_proxy: when truthy, a listed host is enough to return
        ``True`` regardless of its stored value.
    :return: ``True`` if the host is listed and either ``forced_proxy`` is
        truthy or the stored value contains ``'ON'``; ``False`` otherwise,
        including when the setting cannot be decoded.
    """
    import base64
    import ast
    import scrapertools

    try:
        proxy_channel_bloqued_str = base64.b64decode(config.get_setting('proxy_channel_bloqued')).decode('utf-8')
        proxy_channel_bloqued = ast.literal_eval(proxy_channel_bloqued_str)
    except Exception:
        logger.debug('Proxytools no inicializado correctamente')
        return False

    # The host pattern needs a terminating '/' or '?' to match.
    if not url.endswith('/'):
        url += '/'

    # Extract the host once and reuse it for the membership test and lookup.
    host = scrapertools.find_single_match(url, r'(?:http.*:\/\/)?([^\?|\/]+)(?:\?|\/)')
    if host not in proxy_channel_bloqued:
        return False
    if forced_proxy:
        return True
    return 'ON' in proxy_channel_bloqued[host]
|
||||
|
||||
|
||||
class NoRedirectHandler(urllib2.HTTPRedirectHandler):
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -10,25 +10,6 @@ from core import httptools
|
||||
from platformcode import logger
|
||||
|
||||
|
||||
def downloadpage(url, post=None, headers=None, follow_redirects=True, timeout=None, header_to_get=None):
    """Fetch ``url`` through ``httptools.downloadpage``.

    Returns the response header named ``header_to_get`` when that argument
    is truthy, otherwise the response data.
    """
    request_args = dict(post=post, headers=headers, follow_redirects=follow_redirects, timeout=timeout)
    page = httptools.downloadpage(url, **request_args)
    if not header_to_get:
        return page.data
    return page.headers.get(header_to_get)
|
||||
|
||||
|
||||
def downloadpageGzip(url):
    """Fetch ``url`` with ``add_referer=True`` and return the response data."""
    return httptools.downloadpage(url, add_referer=True).data
|
||||
|
||||
|
||||
def getLocationHeaderFromResponse(url):
    """Request only the headers of ``url`` and return its ``location`` header."""
    reply_headers = httptools.downloadpage(url, only_headers=True).headers
    return reply_headers.get("location")
|
||||
|
||||
|
||||
def get_header_from_response(url, header_to_get="", post=None, headers=None):
|
||||
header_to_get = header_to_get.lower()
|
||||
response = httptools.downloadpage(url, post=post, headers=headers, only_headers=True)
|
||||
@@ -48,11 +29,6 @@ def printMatches(matches):
|
||||
i = i + 1
|
||||
|
||||
|
||||
def get_match(data, patron, index=0):
    """Return match number ``index`` of ``patron`` in ``data`` (DOTALL search).

    Raises ``IndexError`` when there are not enough matches.
    """
    return re.compile(patron, re.DOTALL).findall(data)[index]
|
||||
|
||||
|
||||
def find_single_match(data, patron, index=0):
|
||||
try:
|
||||
matches = re.findall(patron, data, flags=re.DOTALL)
|
||||
|
||||
@@ -18,10 +18,6 @@ def printMatches(matches):
|
||||
i = i + 1
|
||||
|
||||
|
||||
def get_match(data, patron, index=0):
    """Return match number ``index`` of ``patron`` in ``data``.

    Thin alias of ``find_single_match``. The requested ``index`` is now
    forwarded; it was previously ignored and 0 was always used.
    """
    return find_single_match(data, patron, index=index)
|
||||
|
||||
|
||||
def find_single_match(data, patron, index=0):
|
||||
try:
|
||||
matches = re.findall(patron, data, flags=re.DOTALL)
|
||||
|
||||
Reference in New Issue
Block a user