Mejoras internas

This commit is contained in:
Kingbox
2019-04-03 16:51:53 +02:00
parent 867a14fe23
commit 7eece68c45
12 changed files with 390 additions and 249 deletions
+234 -140
View File
@@ -99,7 +99,7 @@ load_cookies()
def downloadpage(url, post=None, headers=None, timeout=None, follow_redirects=True, cookies=True, replace_headers=False,
add_referer=False, only_headers=False, bypass_cloudflare=True, count_retries=0, random_headers=False, ignore_response_code=False, alfa_s=False):
add_referer=False, only_headers=False, bypass_cloudflare=True, count_retries=0, count_retries_tot=5, random_headers=False, ignore_response_code=False, alfa_s=False, proxy=True, proxy_web=False, forced_proxy=None, proxy_retries=1):
"""
Abre una url y retorna los datos obtenidos
@@ -160,169 +160,263 @@ def downloadpage(url, post=None, headers=None, timeout=None, follow_redirects=Tr
url = urllib.quote(url, safe="%/:=&?~#+!$,;'@()*[]")
# Limitar tiempo de descarga si no se ha pasado timeout y hay un valor establecido en la variable global
if timeout is None and HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT is not None: timeout = HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT
if timeout == 0: timeout = None
#Si la descarga requiere que se haga a través de un servicio Proxy o ProxyWeb, se prepara la url
proxy_retries_counter = 0
url_save = url
post_save = post
while proxy_retries_counter <= proxy_retries:
# Handlers init
handlers = [urllib2.HTTPHandler(debuglevel=False)]
proxy_retries_counter += 1
proxy_stat = ''
proxy_addr = ''
proxy_CF_addr = ''
proxy_web_name = ''
proxy_log = ''
import proxytools
try:
if (proxy or proxy_web) and (forced_proxy or proxytools.channel_proxy_list(url, forced_proxy=forced_proxy)):
proxy_addr, proxy_CF_addr, proxy_web_name, proxy_log = proxytools.get_proxy_addr(url, post=post, forced_proxy=forced_proxy)
if proxy and proxy_addr:
handlers.append(urllib2.ProxyHandler(proxy_addr))
proxy_stat = ', Proxy Direct ' + proxy_log
elif proxy and proxy_CF_addr:
handlers.append(urllib2.ProxyHandler(proxy_CF_addr))
proxy_stat = ', Proxy CF ' + proxy_log
elif proxy and not proxy_addr and not proxy_CF_addr:
proxy = False
if not proxy_web_name:
proxy_addr, proxy_CF_addr, proxy_web_name, proxy_log = proxytools.get_proxy_addr(url, forced_proxy='Total')
if proxy_web_name:
proxy_web = True
else:
proxy_web = False
if proxy_addr:
proxy = True
handlers.append(urllib2.ProxyHandler(proxy_addr))
proxy_stat = ', Proxy Direct ' + proxy_log
if not alfa_s:
logger.info("----------------------------------------------")
logger.info("downloadpage Alfa: %s" %__version)
logger.info("----------------------------------------------")
logger.info("Timeout: %s" % timeout)
logger.info("URL: " + url)
logger.info("Dominio: " + urlparse.urlparse(url)[1])
if post:
logger.info("Peticion: POST")
else:
logger.info("Peticion: GET")
logger.info("Usar Cookies: %s" % cookies)
logger.info("Descargar Pagina: %s" % (not only_headers))
logger.info("Fichero de Cookies: " + ficherocookies)
if proxy_web and proxy_web_name:
if post: proxy_log = '(POST) ' + proxy_log
url, post, headers_proxy, proxy_web_name = proxytools.set_proxy_web(url, proxy_web_name, post=post)
if proxy_web_name:
proxy_stat = ', Proxy Web ' + proxy_log
if headers_proxy:
request_headers.update(dict(headers_proxy))
if proxy_web and not proxy_web_name:
proxy_web = False
proxy_addr, proxy_CF_addr, proxy_web_name, proxy_log = proxytools.get_proxy_addr(url, forced_proxy='Total')
if proxy_CF_addr:
proxy = True
handlers.append(urllib2.ProxyHandler(proxy_CF_addr))
proxy_stat = ', Proxy CF ' + proxy_log
elif proxy_addr:
proxy = True
handlers.append(urllib2.ProxyHandler(proxy_addr))
proxy_stat = ', Proxy Direct ' + proxy_log
except:
import traceback
logger.error(traceback.format_exc())
proxy = ''
proxy_web = ''
proxy_stat = ''
proxy_addr = ''
proxy_CF_addr = ''
proxy_web_name = ''
proxy_log = ''
url = url_save
# Limitar tiempo de descarga si no se ha pasado timeout y hay un valor establecido en la variable global
if timeout is None and HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT is not None: timeout = HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT
if timeout == 0: timeout = None
if not alfa_s:
logger.info("----------------------------------------------")
logger.info("downloadpage Alfa: %s" %__version)
logger.info("----------------------------------------------")
logger.info("Timeout: %s" % timeout)
logger.info("URL: " + url)
logger.info("Dominio: " + urlparse.urlparse(url)[1])
if post:
logger.info("Peticion: POST" + proxy_stat)
else:
logger.info("Peticion: GET" + proxy_stat)
logger.info("Usar Cookies: %s" % cookies)
logger.info("Descargar Pagina: %s" % (not only_headers))
logger.info("Fichero de Cookies: " + ficherocookies)
logger.info("Headers:")
for header in request_headers:
logger.info("- %s: %s" % (header, request_headers[header]))
# Handlers
handlers = [urllib2.HTTPHandler(debuglevel=False)]
# Handlers
if not follow_redirects:
handlers.append(NoRedirectHandler())
if not follow_redirects:
handlers.append(NoRedirectHandler())
if cookies:
handlers.append(urllib2.HTTPCookieProcessor(cj))
if cookies:
handlers.append(urllib2.HTTPCookieProcessor(cj))
opener = urllib2.build_opener(*handlers)
opener = urllib2.build_opener(*handlers)
if not alfa_s:
logger.info("Realizando Peticion")
# Contador
inicio = time.time()
req = urllib2.Request(url, post, request_headers)
try:
if urllib2.__version__ == "2.4":
import socket
deftimeout = socket.getdefaulttimeout()
if timeout is not None:
socket.setdefaulttimeout(timeout)
handle = opener.open(req)
socket.setdefaulttimeout(deftimeout)
else:
handle = opener.open(req, timeout=timeout)
except urllib2.HTTPError, handle:
response["sucess"] = False
response["code"] = handle.code
response["error"] = handle.__dict__.get("reason", str(handle))
response["headers"] = handle.headers.dict
if not only_headers:
response["data"] = handle.read()
else:
response["data"] = ""
response["time"] = time.time() - inicio
response["url"] = handle.geturl()
except Exception, e:
response["sucess"] = False
response["code"] = e.__dict__.get("errno", e.__dict__.get("code", str(e)))
response["error"] = e.__dict__.get("reason", str(e))
response["headers"] = {}
response["data"] = ""
response["time"] = time.time() - inicio
response["url"] = url
else:
response["sucess"] = True
response["code"] = handle.code
response["error"] = None
response["headers"] = handle.headers.dict
if not only_headers:
response["data"] = handle.read()
else:
response["data"] = ""
response["time"] = time.time() - inicio
response["url"] = handle.geturl()
if not alfa_s:
logger.info("Terminado en %.2f segundos" % (response["time"]))
logger.info("Response sucess: %s" % (response["sucess"]))
logger.info("Response code: %s" % (response["code"]))
logger.info("Response error: %s" % (response["error"]))
logger.info("Response data length: %s" % (len(response["data"])))
logger.info("Response headers:")
server_cloudflare = ""
for header in response["headers"]:
if not alfa_s:
logger.info("- %s: %s" % (header, response["headers"][header]))
if "cloudflare" in response["headers"][header]:
server_cloudflare = "cloudflare"
logger.info("Realizando Peticion")
is_channel = inspect.getmodule(inspect.currentframe().f_back)
# error 4xx o 5xx se lanza excepcion (menos para servidores)
# response["code"] = 400 # linea de código para probar
is_channel = str(is_channel).replace("/servers/","\\servers\\") # Para sistemas operativos diferente a Windows la ruta cambia
if type(response["code"]) == int and "\\servers\\" not in str(is_channel) and not ignore_response_code:
if response["code"] > 399 and (server_cloudflare == "cloudflare" and response["code"] != 503):
raise WebErrorException(urlparse.urlparse(url)[1])
# Contador
inicio = time.time()
if cookies:
save_cookies()
req = urllib2.Request(url, post, request_headers)
if not alfa_s:
logger.info("Encoding: %s" % (response["headers"].get('content-encoding')))
if response["headers"].get('content-encoding') == 'gzip':
if not alfa_s:
logger.info("Descomprimiendo...")
data_alt = response["data"]
try:
response["data"] = gzip.GzipFile(fileobj=StringIO(response["data"])).read()
if urllib2.__version__ == "2.4":
import socket
deftimeout = socket.getdefaulttimeout()
if timeout is not None:
socket.setdefaulttimeout(timeout)
handle = opener.open(req)
socket.setdefaulttimeout(deftimeout)
else:
handle = opener.open(req, timeout=timeout)
except urllib2.HTTPError, handle:
response["sucess"] = False
response["code"] = handle.code
response["error"] = handle.__dict__.get("reason", str(handle))
response["headers"] = handle.headers.dict
if not only_headers:
response["data"] = handle.read()
else:
response["data"] = ""
response["time"] = time.time() - inicio
response["url"] = handle.geturl()
except Exception, e:
response["sucess"] = False
response["code"] = e.__dict__.get("errno", e.__dict__.get("code", str(e)))
response["error"] = e.__dict__.get("reason", str(e))
response["headers"] = {}
response["data"] = ""
response["time"] = time.time() - inicio
response["url"] = url
else:
response["sucess"] = True
response["code"] = handle.code
response["error"] = None
response["headers"] = handle.headers.dict
if not only_headers:
response["data"] = handle.read()
else:
response["data"] = ""
response["time"] = time.time() - inicio
response["url"] = handle.geturl()
if not alfa_s:
logger.info("Terminado en %.2f segundos" % (response["time"]))
logger.info("Response sucess: %s" % (response["sucess"]))
logger.info("Response code: %s" % (response["code"]))
logger.info("Response error: %s" % (response["error"]))
logger.info("Response data length: %s" % (len(response["data"])))
logger.info("Response headers:")
server_cloudflare = ""
for header in response["headers"]:
if not alfa_s:
logger.info("Descomprimido")
except:
logger.info("- %s: %s" % (header, response["headers"][header]))
if "cloudflare" in response["headers"][header]:
server_cloudflare = "cloudflare"
is_channel = inspect.getmodule(inspect.currentframe().f_back)
# error 4xx o 5xx se lanza excepcion (menos para servidores)
# response["code"] = 400 # linea de código para probar
is_channel = str(is_channel).replace("/servers/","\\servers\\") # Para sistemas operativos diferente a Windows la ruta cambia
if type(response["code"]) == int and "\\servers\\" not in str(is_channel) and not ignore_response_code and not proxy_stat:
if response["code"] > 399 and (server_cloudflare == "cloudflare" and response["code"] != 503):
raise WebErrorException(urlparse.urlparse(url)[1])
if cookies:
save_cookies()
if not alfa_s:
logger.info("Encoding: %s" % (response["headers"].get('content-encoding')))
if response["headers"].get('content-encoding') == 'gzip':
if not alfa_s:
logger.info("No se ha podido descomprimir con gzip. Intentando con zlib")
response["data"] = data_alt
logger.info("Descomprimiendo...")
data_alt = response["data"]
try:
import zlib
response["data"] = zlib.decompressobj(16 + zlib.MAX_WBITS).decompress(response["data"])
response["data"] = gzip.GzipFile(fileobj=StringIO(response["data"])).read()
if not alfa_s:
logger.info("Descomprimido")
except:
if not alfa_s:
logger.info("No se ha podido descomprimir con zlib")
logger.info("No se ha podido descomprimir con gzip. Intentando con zlib")
response["data"] = data_alt
try:
import zlib
response["data"] = zlib.decompressobj(16 + zlib.MAX_WBITS).decompress(response["data"])
except:
if not alfa_s:
logger.info("No se ha podido descomprimir con zlib")
response["data"] = data_alt
# Anti Cloudflare
if bypass_cloudflare and count_retries < 5:
cf = Cloudflare(response)
if cf.is_cloudflare:
count_retries += 1
if not alfa_s:
logger.info("cloudflare detectado, esperando %s segundos..." % cf.wait_time)
auth_url = cf.get_url()
if not alfa_s:
logger.info("Autorizando... intento %d url: %s" % (count_retries, auth_url))
if downloadpage(auth_url, headers=request_headers, replace_headers=True, count_retries=count_retries).sucess:
# Anti Cloudflare
if bypass_cloudflare and count_retries < count_retries_tot:
cf = Cloudflare(response)
if cf.is_cloudflare:
count_retries += 1
if not alfa_s:
logger.info("Autorización correcta, descargando página")
resp = downloadpage(url=response["url"], post=post, headers=headers, timeout=timeout,
follow_redirects=follow_redirects,
cookies=cookies, replace_headers=replace_headers, add_referer=add_referer)
response["sucess"] = resp.sucess
response["code"] = resp.code
response["error"] = resp.error
response["headers"] = resp.headers
response["data"] = resp.data
response["time"] = resp.time
response["url"] = resp.url
logger.info("cloudflare detectado, esperando %s segundos..." % cf.wait_time)
auth_url = cf.get_url()
if not alfa_s:
logger.info("Autorizando... intento %d url: %s" % (count_retries, auth_url))
tt = downloadpage(auth_url, headers=request_headers, replace_headers=True, count_retries=count_retries, ignore_response_code=True, count_retries_tot=count_retries_tot, proxy=proxy, proxy_web=proxy_web)
if tt.code == 403:
tt = downloadpage(url, headers=request_headers, replace_headers=True, count_retries=count_retries, ignore_response_code=True, count_retries_tot=count_retries_tot, proxy=proxy, proxy_web=proxy_web)
if tt.sucess:
if not alfa_s:
logger.info("Autorización correcta, descargando página")
resp = downloadpage(url=response["url"], post=post, headers=headers, timeout=timeout,
follow_redirects=follow_redirects, count_retries=count_retries,
cookies=cookies, replace_headers=replace_headers, add_referer=add_referer, proxy=proxy, proxy_web=proxy_web, count_retries_tot=count_retries_tot)
response["sucess"] = resp.sucess
response["code"] = resp.code
response["error"] = resp.error
response["headers"] = resp.headers
response["data"] = resp.data
response["time"] = resp.time
response["url"] = resp.url
else:
if not alfa_s:
logger.info("No se ha podido autorizar")
# Si hay errores usando un Proxy, se refrescan el Proxy y se reintenta el número de veces indicado en proxy_retries
try:
if ', Proxy Web' in proxy_stat:
response["data"] = proxytools.restore_after_proxy_web(response["data"], proxy_web_name, url_save)
if response["data"] == 'ERROR':
response['sucess'] = False
if proxy_stat and response['sucess'] == False and proxy_retries_counter <= proxy_retries and count_retries_tot > 1:
if ', Proxy Direct' in proxy_stat:
proxytools.get_proxy_list_method(proxy_init='ProxyDirect')
elif ', Proxy CF' in proxy_stat:
proxytools.get_proxy_list_method(proxy_init='ProxyCF')
url = url_save
elif ', Proxy Web' in proxy_stat:
proxytools.get_proxy_list_method(proxy_init='ProxyWeb')
url = url_save
post = post_save
else:
if not alfa_s:
logger.info("No se ha podido autorizar")
break
except:
import traceback
logger.error(traceback.format_exc())
break
return type('HTTPResponse', (), response)
def random_useragent():
"""
Based on code from https://github.com/theriley106/RandomHeaders