# -*- coding: utf-8 -*-
# --------------------------------------------------------------------------------
# httptools
# --------------------------------------------------------------------------------

# Fix for certificate-validation errors of the type:
# [downloadpage] Response code: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:661)>
# [downloadpage] Response error: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:661)
# Fix taken from: https://stackoverflow.com/questions/27835619/urllib-and-ssl-certificate-verify-failed-error
#-----------------------------------------------------------------------
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # Legacy Python that doesn't verify HTTPS certificates by default
    pass
else:
    # Handle target environment that doesn't support HTTPS verification
    ssl._create_default_https_context = _create_unverified_https_context
#-----------------------------------------------------------------------

import inspect
import cookielib
import gzip
import os
import time
import urllib
import urllib2
import urlparse
from StringIO import StringIO
from threading import Lock

from core.cloudflare import Cloudflare
from platformcode import config, logger
from platformcode.logger import WebErrorException

# Get the addon version
__version = config.get_addon_version()

cookies_lock = Lock()

cj = cookielib.MozillaCookieJar()
ficherocookies = os.path.join(config.get_data_path(), "cookies.dat")

# Default headers, used when nothing else is specified
default_headers = dict()
default_headers["User-Agent"] = "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3163.100 Safari/537.36"
default_headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
default_headers["Accept-Language"] = "es-ES,es;q=0.8,en-US;q=0.5,en;q=0.3"
default_headers["Accept-Charset"] = "UTF-8"
default_headers["Accept-Encoding"] = "gzip"

# Maximum wait time for downloadpage, used when nothing else is specified
HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT = config.get_setting('httptools_timeout', default=15)
if HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT == 0:
    HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT = None

# Random User-Agent selection, used when nothing else is specified
HTTPTOOLS_DEFAULT_RANDOM_HEADERS = False


def get_user_agent():
    # Returns the global user agent, to be used when a url needs it.
    return default_headers["User-Agent"]


def get_url_headers(url):
    domain_cookies = cj._cookies.get("." + urlparse.urlparse(url)[1], {}).get("/", {})

    if "|" in url or "cf_clearance" not in domain_cookies:
        return url

    headers = dict()
    headers["User-Agent"] = default_headers["User-Agent"]
    headers["Cookie"] = "; ".join(["%s=%s" % (c.name, c.value) for c in domain_cookies.values()])

    return url + "|" + "&".join(["%s=%s" % (h, headers[h]) for h in headers])


def load_cookies():
    cookies_lock.acquire()
    if os.path.isfile(ficherocookies):
        logger.info("Reading cookies file")
        try:
            cj.load(ficherocookies, ignore_discard=True)
        except:
            logger.info("The cookies file exists but is unreadable, deleting it")
            os.remove(ficherocookies)
    cookies_lock.release()


def save_cookies():
    cookies_lock.acquire()
    logger.info("Saving cookies...")
    cj.save(ficherocookies, ignore_discard=True)
    cookies_lock.release()


load_cookies()


def downloadpage(url, post=None, headers=None, timeout=None, follow_redirects=True, cookies=True, replace_headers=False,
                 add_referer=False, only_headers=False, bypass_cloudflare=True, count_retries=0, random_headers=False):
    """
    Opens a url and returns the data obtained

    @param url: url to open.
    @type url: str
    @param post: If it contains any value, it is sent via POST.
    @type post: str
    @param headers: Headers for the request; if empty, the default headers are used.
    @type headers: dict, list
    @param timeout: Timeout for the request.
    @type timeout: int
    @param follow_redirects: Indicates whether redirects are to be followed.
    @type follow_redirects: bool
    @param cookies: Indicates whether cookies are to be used.
    @type cookies: bool
    @param replace_headers: If True, the headers passed in the "headers" parameter completely replace the default headers.
                            If False, the headers passed in the "headers" parameter update the default headers.
    @type replace_headers: bool
    @param add_referer: Indicates whether the "Referer" header is to be added, using the domain of the url as its value.
    @type add_referer: bool
    @param only_headers: If True, only the headers are downloaded, omitting the content of the url.
    @type only_headers: bool
    @param bypass_cloudflare: If True, Cloudflare challenges are resolved automatically (up to 5 retries).
    @type bypass_cloudflare: bool
    @param count_retries: Internal counter for the Cloudflare retry recursion.
    @type count_retries: int
    @param random_headers: If True, a random User-Agent is picked for the request.
    @type random_headers: bool
    @return: Result of the request
    @rtype: HTTPResponse

    Attribute                 Type   Description
    ----------------------------------------------------------------------------------------------------------------
    HTTPResponse.sucess:      bool   True: request completed successfully | False: error while making the request
    HTTPResponse.code:        int    Server response code, or error code if an error occurred
    HTTPResponse.error:       str    Error description, if an error occurred
    HTTPResponse.headers:     dict   Dictionary with the server response headers
    HTTPResponse.data:        str    Response obtained from the server
    HTTPResponse.time:        float  Time taken to make the request
    """

    response = {}

    # Default headers, used when nothing else is specified
    request_headers = default_headers.copy()

    # Headers passed as parameters
    if headers is not None:
        if not replace_headers:
            request_headers.update(dict(headers))
        else:
            request_headers = dict(headers)

    if add_referer:
        request_headers["Referer"] = "/".join(url.split("/")[:3])

    if random_headers or HTTPTOOLS_DEFAULT_RANDOM_HEADERS:
        request_headers['User-Agent'] = random_useragent()

    url = urllib.quote(url, safe="%/:=&?~#+!$,;'@()*[]")

    # Limit the download time if no timeout was passed and the global variable has a value set
    if timeout is None and HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT is not None:
        timeout = HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT

    logger.info("----------------------------------------------")
    logger.info("downloadpage Alfa: %s" % __version)
    logger.info("----------------------------------------------")
    logger.info("Timeout: %s" % timeout)
    logger.info("URL: " + url)
    logger.info("Domain: " + urlparse.urlparse(url)[1])
    if post:
        logger.info("Request: POST")
    else:
        logger.info("Request: GET")
    logger.info("Use cookies: %s" % cookies)
    logger.info("Download page: %s" % (not only_headers))
    logger.info("Cookies file: " + ficherocookies)
    logger.info("Headers:")
    for header in request_headers:
        logger.info("- %s: %s" % (header, request_headers[header]))

    # Handlers
    handlers = [urllib2.HTTPHandler(debuglevel=False)]

    if not follow_redirects:
        handlers.append(NoRedirectHandler())

    if cookies:
        handlers.append(urllib2.HTTPCookieProcessor(cj))

    opener = urllib2.build_opener(*handlers)

    logger.info("Making request")

    # Timer
    inicio = time.time()

    req = urllib2.Request(url, post, request_headers)

    try:
        if urllib2.__version__ == "2.4":
            import socket
            deftimeout = socket.getdefaulttimeout()
            if timeout is not None:
                socket.setdefaulttimeout(timeout)
            handle = opener.open(req)
            socket.setdefaulttimeout(deftimeout)
        else:
            handle = opener.open(req, timeout=timeout)

    except urllib2.HTTPError, handle:
        response["sucess"] = False
        response["code"] = handle.code
        response["error"] = handle.__dict__.get("reason", str(handle))
        response["headers"] = handle.headers.dict
        if not only_headers:
            response["data"] = handle.read()
        else:
            response["data"] = ""
        response["time"] = time.time() - inicio
        response["url"] = handle.geturl()

    except Exception, e:
        response["sucess"] = False
        response["code"] = e.__dict__.get("errno", e.__dict__.get("code", str(e)))
        response["error"] = e.__dict__.get("reason", str(e))
        response["headers"] = {}
        response["data"] = ""
        response["time"] = time.time() - inicio
        response["url"] = url

    else:
        response["sucess"] = True
        response["code"] = handle.code
        response["error"] = None
        response["headers"] = handle.headers.dict
        if not only_headers:
            response["data"] = handle.read()
        else:
            response["data"] = ""
        response["time"] = time.time() - inicio
        response["url"] = handle.geturl()

    logger.info("Finished in %.2f seconds" % (response["time"]))
    logger.info("Response sucess: %s" % (response["sucess"]))
    logger.info("Response code: %s" % (response["code"]))
    logger.info("Response error: %s" % (response["error"]))
    logger.info("Response data length: %s" % (len(response["data"])))
    logger.info("Response headers:")
    server_cloudflare = ""
    for header in response["headers"]:
        logger.info("- %s: %s" % (header, response["headers"][header]))
        if "cloudflare" in response["headers"][header]:
            server_cloudflare = "cloudflare"

    is_channel = inspect.getmodule(inspect.currentframe().f_back)
    # A 4xx or 5xx error raises an exception (except for servers)
    # response["code"] = 400  # test line to force the error path
    is_channel = str(is_channel).replace("/servers/", "\\servers\\")  # On non-Windows systems the path separator differs
    if type(response["code"]) == int and "\\servers\\" not in str(is_channel):
        # Do not raise for Cloudflare 503 challenge responses; those are handled by the bypass below
        if response["code"] > 399 and not (server_cloudflare == "cloudflare" and response["code"] == 503):
            raise WebErrorException(urlparse.urlparse(url)[1])

    if cookies:
        save_cookies()

    logger.info("Encoding: %s" % (response["headers"].get('content-encoding')))

    if response["headers"].get('content-encoding') == 'gzip':
        logger.info("Decompressing...")
        try:
            response["data"] = gzip.GzipFile(fileobj=StringIO(response["data"])).read()
            logger.info("Decompressed")
        except:
            logger.info("Could not decompress")

    # Anti Cloudflare
    if bypass_cloudflare and count_retries < 5:
        cf = Cloudflare(response)
        if cf.is_cloudflare:
            count_retries += 1
            logger.info("Cloudflare detected, waiting %s seconds..." % cf.wait_time)
            auth_url = cf.get_url()
            logger.info("Authorizing... attempt %d url: %s" % (count_retries, auth_url))
            if downloadpage(auth_url, headers=request_headers, replace_headers=True, count_retries=count_retries).sucess:
                logger.info("Authorization OK, downloading the page")
                resp = downloadpage(url=response["url"], post=post, headers=headers, timeout=timeout,
                                    follow_redirects=follow_redirects,
                                    cookies=cookies, replace_headers=replace_headers, add_referer=add_referer)
                response["sucess"] = resp.sucess
                response["code"] = resp.code
                response["error"] = resp.error
                response["headers"] = resp.headers
                response["data"] = resp.data
                response["time"] = resp.time
                response["url"] = resp.url
            else:
                logger.info("Could not authorize")

    return type('HTTPResponse', (), response)
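
# Usage sketch (hypothetical url; the attributes match the HTTPResponse table
# in the docstring above):
#   response = downloadpage("https://example.com/login", post="user=foo&pass=bar")
#   if response.sucess and response.code == 200:
#       print response.data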


def random_useragent():
    """
    Based on code from https://github.com/theriley106/RandomHeaders

    Python method that generates fake user agents from a locally saved DB (.csv file).

    This is useful for webscraping, and for testing programs that identify devices based on the user agent.
    """
    import random

    UserAgentPath = os.path.join(config.get_runtime_path(), 'tools', 'UserAgent.csv')
    if os.path.exists(UserAgentPath):
        UserAgentItem = random.choice(list(open(UserAgentPath))).strip()
        if UserAgentItem:
            return UserAgentItem

    return default_headers["User-Agent"]
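
# Note: despite the .csv extension, the code above treats each line of
# UserAgent.csv as one complete user-agent string, e.g. (hypothetical entry):
#   Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36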


class NoRedirectHandler(urllib2.HTTPRedirectHandler):
    def http_error_302(self, req, fp, code, msg, headers):
        infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        infourl.status = code
        infourl.code = code
        return infourl

    http_error_300 = http_error_302
    http_error_301 = http_error_302
    http_error_303 = http_error_302
    http_error_307 = http_error_302
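
# Usage sketch: with follow_redirects=False, downloadpage() installs this handler,
# so a 3xx response is returned as-is instead of being followed; the target can
# then be read from the (lowercased) "location" response header (hypothetical url):
#   resp = downloadpage("https://example.com/r", follow_redirects=False, only_headers=True)
#   new_url = resp.headers.get("location")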