Merge branch 'master' into master

This commit is contained in:
lozioangie
2019-04-11 21:29:25 +02:00
committed by GitHub
44 changed files with 1735 additions and 330 deletions
+38 -14
View File
@@ -10,7 +10,6 @@ import urlparse
from platformcode import logger
from decimal import Decimal
from js2py.internals import seval
class Cloudflare:
@@ -47,25 +46,50 @@ class Cloudflare:
logger.debug("Metodo #2 (headers): NO disponible")
self.header_data = {}
def solve_cf(self, body, domain):
k = re.compile('<div style="display:none;visibility:hidden;" id=".*?">(.*?)<\/div>', re.DOTALL).findall(body)
k1 = re.compile('function\(p\){var p = eval\(eval.*?atob.*?return \+\(p\)}\(\)', re.DOTALL).findall(body)
if k1:
body = body.replace(k1[0], k[0])
js = re.search(r"setTimeout\(function\(\){\s+(var "
"s,t,o,p,b,r,e,a,k,i,n,g,f.+?\r?\n[\s\S]+?a\.value =.+?)\r?\n", body).group(1)
js = re.search(
r"setTimeout\(function\(\){\s+(var s,t,o,p,b,r,e,a,k,i,n,g,f.+?\r?\n[\s\S]+?a\.value =.+?)\r?\n",
body
).group(1)
js = re.sub(r"a\.value = ((.+).toFixed\(10\))?", r"\1", js)
js = re.sub(r'(e\s=\sfunction\(s\)\s{.*?};)', '', js, flags=re.DOTALL|re.MULTILINE)
js = re.sub(r"\s{3,}[a-z](?: = |\.).+", "", js).replace("t.length", str(len(domain)))
js = js.replace('; 121', '')
reemplazar = re.compile('(?is)function\(p\)\{return eval.*?\+p\+"\)"\)}', re.DOTALL).findall(js)
if reemplazar:
js = js.replace(reemplazar[0],'t.charCodeAt')
js = re.sub(r"[\n\\']", "", js)
js = 'a = {{}}; t = "{}";{}'.format(domain, js)
result = seval.eval_js_vm(js)
jsEnv = """
var t = "{domain}";
var g = String.fromCharCode;
o = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=";
e = function(s) {{
s += "==".slice(2 - (s.length & 3));
var bm, r = "", r1, r2, i = 0;
for (; i < s.length;) {{
bm = o.indexOf(s.charAt(i++)) << 18 | o.indexOf(s.charAt(i++)) << 12 | (r1 = o.indexOf(s.charAt(i++))) << 6 | (r2 = o.indexOf(s.charAt(i++)));
r += r1 === 64 ? g(bm >> 16 & 255) : r2 === 64 ? g(bm >> 16 & 255, bm >> 8 & 255) : g(bm >> 16 & 255, bm >> 8 & 255, bm & 255);
}}
return r;
}};
function italics (str) {{ return '<i>' + this + '</i>'; }};
var document = {{
getElementById: function () {{
return {{'innerHTML': '{innerHTML}'}};
}}
}};
{js}
"""
innerHTML = re.search('<div(?: [^<>]*)? id="([^<>]*?)">([^<>]*?)<\/div>', body , re.MULTILINE | re.DOTALL)
innerHTML = innerHTML.group(2).replace("'", r"\'") if innerHTML else ""
import js2py
from jsc import jsunc
js = jsunc(jsEnv.format(domain=domain, innerHTML=innerHTML, js=js))
def atob(s):
return base64.b64decode('{}'.format(s)).decode('utf-8')
js2py.disable_pyimport()
context = js2py.EvalJs({'atob': atob})
result = context.eval(js)
return float(result)
@property
def wait_time(self):
+47 -14
View File
@@ -32,7 +32,6 @@ import urlparse
from StringIO import StringIO
from threading import Lock
from core.cloudflare import Cloudflare
from platformcode import config, logger
from platformcode.logger import WebErrorException
@@ -76,21 +75,21 @@ def get_url_headers(url):
return url + "|" + "&".join(["%s=%s" % (h, headers[h]) for h in headers])
def load_cookies():
def load_cookies(alfa_s=False):
cookies_lock.acquire()
if os.path.isfile(ficherocookies):
logger.info("Leyendo fichero cookies")
if not alfa_s: logger.info("Leyendo fichero cookies")
try:
cj.load(ficherocookies, ignore_discard=True)
except:
logger.info("El fichero de cookies existe pero es ilegible, se borra")
if not alfa_s: logger.info("El fichero de cookies existe pero es ilegible, se borra")
os.remove(ficherocookies)
cookies_lock.release()
def save_cookies():
def save_cookies(alfa_s=False):
cookies_lock.acquire()
logger.info("Guardando cookies...")
if not alfa_s: logger.info("Guardando cookies...")
cj.save(ficherocookies, ignore_discard=True)
cookies_lock.release()
@@ -99,7 +98,7 @@ load_cookies()
def downloadpage(url, post=None, headers=None, timeout=None, follow_redirects=True, cookies=True, replace_headers=False,
add_referer=False, only_headers=False, bypass_cloudflare=True, count_retries=0, count_retries_tot=5, random_headers=False, ignore_response_code=False, alfa_s=False, proxy=True, proxy_web=False, forced_proxy=None, proxy_retries=1):
add_referer=False, only_headers=False, bypass_cloudflare=True, count_retries=0, count_retries_tot=5, random_headers=False, ignore_response_code=False, alfa_s=False, proxy=True, proxy_web=False, proxy_addr_forced=None,forced_proxy=None, proxy_retries=1):
"""
Abre una url y retorna los datos obtenidos
@@ -174,19 +173,28 @@ def downloadpage(url, post=None, headers=None, timeout=None, follow_redirects=Tr
proxy_CF_addr = ''
proxy_web_name = ''
proxy_log = ''
import proxytools
try:
if (proxy or proxy_web) and (forced_proxy or proxytools.channel_proxy_list(url, forced_proxy=forced_proxy)):
if (proxy or proxy_web) and (forced_proxy or proxy_addr_forced or channel_proxy_list(url, forced_proxy=forced_proxy)):
import proxytools
proxy_addr, proxy_CF_addr, proxy_web_name, proxy_log = proxytools.get_proxy_addr(url, post=post, forced_proxy=forced_proxy)
if proxy_addr_forced and proxy_log:
import scrapertools
proxy_log = scrapertools.find_single_match(str(proxy_addr_forced), "{'http.*':\s*'(.*?)'}")
if proxy and proxy_addr:
if proxy_addr_forced: proxy_addr = proxy_addr_forced
handlers.append(urllib2.ProxyHandler(proxy_addr))
proxy_stat = ', Proxy Direct ' + proxy_log
elif proxy and proxy_CF_addr:
if proxy_addr_forced: proxy_CF_addr = proxy_addr_forced
handlers.append(urllib2.ProxyHandler(proxy_CF_addr))
proxy_stat = ', Proxy CF ' + proxy_log
elif proxy and not proxy_addr and not proxy_CF_addr:
elif proxy and proxy_addr_forced:
proxy_addr = proxy_addr_forced
handlers.append(urllib2.ProxyHandler(proxy_addr))
proxy_stat = ', Proxy Direct ' + proxy_log
elif proxy and not proxy_addr and not proxy_CF_addr and not proxy_addr_forced:
proxy = False
if not proxy_web_name:
proxy_addr, proxy_CF_addr, proxy_web_name, proxy_log = proxytools.get_proxy_addr(url, forced_proxy='Total')
@@ -335,7 +343,7 @@ def downloadpage(url, post=None, headers=None, timeout=None, follow_redirects=Tr
raise WebErrorException(urlparse.urlparse(url)[1])
if cookies:
save_cookies()
save_cookies(alfa_s=alfa_s)
if not alfa_s:
logger.info("Encoding: %s" % (response["headers"].get('content-encoding')))
@@ -362,6 +370,7 @@ def downloadpage(url, post=None, headers=None, timeout=None, follow_redirects=Tr
# Anti Cloudflare
if bypass_cloudflare and count_retries < count_retries_tot:
from core.cloudflare import Cloudflare
cf = Cloudflare(response)
if cf.is_cloudflare:
count_retries += 1
@@ -370,15 +379,15 @@ def downloadpage(url, post=None, headers=None, timeout=None, follow_redirects=Tr
auth_url = cf.get_url()
if not alfa_s:
logger.info("Autorizando... intento %d url: %s" % (count_retries, auth_url))
tt = downloadpage(auth_url, headers=request_headers, replace_headers=True, count_retries=count_retries, ignore_response_code=True, count_retries_tot=count_retries_tot, proxy=proxy, proxy_web=proxy_web)
tt = downloadpage(auth_url, headers=request_headers, replace_headers=True, count_retries=count_retries, ignore_response_code=True, count_retries_tot=count_retries_tot, proxy=proxy, proxy_web=proxy_web, forced_proxy=forced_proxy, proxy_addr_forced=proxy_addr_forced, alfa_s=alfa_s)
if tt.code == 403:
tt = downloadpage(url, headers=request_headers, replace_headers=True, count_retries=count_retries, ignore_response_code=True, count_retries_tot=count_retries_tot, proxy=proxy, proxy_web=proxy_web)
tt = downloadpage(url, headers=request_headers, replace_headers=True, count_retries=count_retries, ignore_response_code=True, count_retries_tot=count_retries_tot, proxy=proxy, proxy_web=proxy_web, forced_proxy=forced_proxy, proxy_addr_forced=proxy_addr_forced, alfa_s=alfa_s)
if tt.sucess:
if not alfa_s:
logger.info("Autorización correcta, descargando página")
resp = downloadpage(url=response["url"], post=post, headers=headers, timeout=timeout,
follow_redirects=follow_redirects, count_retries=count_retries,
cookies=cookies, replace_headers=replace_headers, add_referer=add_referer, proxy=proxy, proxy_web=proxy_web, count_retries_tot=count_retries_tot)
cookies=cookies, replace_headers=replace_headers, add_referer=add_referer, proxy=proxy, proxy_web=proxy_web, count_retries_tot=count_retries_tot, forced_proxy=forced_proxy, proxy_addr_forced=proxy_addr_forced, alfa_s=alfa_s)
response["sucess"] = resp.sucess
response["code"] = resp.code
response["error"] = resp.error
@@ -435,6 +444,30 @@ def random_useragent():
return UserAgentIem
return default_headers["User-Agent"]
def channel_proxy_list(url, forced_proxy=None):
import base64
import ast
import scrapertools
try:
proxy_channel_bloqued_str = base64.b64decode(config.get_setting('proxy_channel_bloqued')).decode('utf-8')
proxy_channel_bloqued = dict()
proxy_channel_bloqued = ast.literal_eval(proxy_channel_bloqued_str)
except:
logger.debug('Proxytools no inicializado correctamente')
return False
if not url.endswith('/'):
url += '/'
if scrapertools.find_single_match(url, '(?:http.*:\/\/)?([^\?|\/]+)(?:\?|\/)') in proxy_channel_bloqued:
if forced_proxy:
return True
if 'ON' in proxy_channel_bloqued[scrapertools.find_single_match(url, '(?:http.*:\/\/)?([^\?|\/]+)(?:\?|\/)')]:
return True
return False
class NoRedirectHandler(urllib2.HTTPRedirectHandler):
File diff suppressed because one or more lines are too long
-24
View File
@@ -10,25 +10,6 @@ from core import httptools
from platformcode import logger
def downloadpage(url, post=None, headers=None, follow_redirects=True, timeout=None, header_to_get=None):
response = httptools.downloadpage(url, post=post, headers=headers, follow_redirects=follow_redirects,
timeout=timeout)
if header_to_get:
return response.headers.get(header_to_get)
else:
return response.data
def downloadpageGzip(url):
response = httptools.downloadpage(url, add_referer=True)
return response.data
def getLocationHeaderFromResponse(url):
response = httptools.downloadpage(url, only_headers=True)
return response.headers.get("location")
def get_header_from_response(url, header_to_get="", post=None, headers=None):
header_to_get = header_to_get.lower()
response = httptools.downloadpage(url, post=post, headers=headers, only_headers=True)
@@ -48,11 +29,6 @@ def printMatches(matches):
i = i + 1
def get_match(data, patron, index=0):
matches = re.findall(patron, data, flags=re.DOTALL)
return matches[index]
def find_single_match(data, patron, index=0):
try:
matches = re.findall(patron, data, flags=re.DOTALL)
-4
View File
@@ -18,10 +18,6 @@ def printMatches(matches):
i = i + 1
def get_match(data, patron, index=0):
return find_single_match(data, patron, index=0)
def find_single_match(data, patron, index=0):
try:
matches = re.findall(patron, data, flags=re.DOTALL)