# -*- coding: utf-8 -*-
# --------------------------------------------------------------------------------
# httptools
# --------------------------------------------------------------------------------
# Fix for certificate validation errors of the type:
# [downloadpage] Response code: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:661)>
# [downloadpage] Response error: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:661)
# Fix from: https://stackoverflow.com/questions/27835619/urllib-and-ssl-certificate-verify-failed-error
#-----------------------------------------------------------------------
import ssl
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # Legacy Python that doesn't verify HTTPS certificates by default
    pass
else:
    # Handle target environment that doesn't support HTTPS verification
    ssl._create_default_https_context = _create_unverified_https_context
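# Note: this monkeypatch globally disables HTTPS certificate verification for
# every urllib2 request made by this process, not just the ones in this module.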
#-----------------------------------------------------------------------
import inspect
import cookielib
import gzip
import os
import time
import urllib
import urllib2
import urlparse
from StringIO import StringIO
from threading import Lock
from core.cloudflare import Cloudflare
from platformcode import config, logger
from platformcode.logger import WebErrorException
## Gets the addon version
__version = config.get_addon_version()
cookies_lock = Lock()
cj = cookielib.MozillaCookieJar()
ficherocookies = os.path.join(config.get_data_path(), "cookies.dat")
# Default headers, if nothing is specified
default_headers = dict()
default_headers["User-Agent"] = "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3163.100 Safari/537.36"
default_headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
default_headers["Accept-Language"] = "es-ES,es;q=0.8,en-US;q=0.5,en;q=0.3"
default_headers["Accept-Charset"] = "UTF-8"
default_headers["Accept-Encoding"] = "gzip"
# Maximum wait time for downloadpage, if nothing is specified
HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT = config.get_setting('httptools_timeout', default=15)
if HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT == 0: HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT = None
# Random use of User-Agents, if nothing is specified
HTTPTOOLS_DEFAULT_RANDOM_HEADERS = False
def get_user_agent():
    # Returns the global user agent, to be used when needed for the url.
    return default_headers["User-Agent"]
def get_url_headers(url):
    domain_cookies = cj._cookies.get("." + urlparse.urlparse(url)[1], {}).get("/", {})
    if "|" in url or "cf_clearance" not in domain_cookies:
        return url
    headers = dict()
    headers["User-Agent"] = default_headers["User-Agent"]
    headers["Cookie"] = "; ".join(["%s=%s" % (c.name, c.value) for c in domain_cookies.values()])
    return url + "|" + "&".join(["%s=%s" % (h, headers[h]) for h in headers])
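
# A sketch of the pipe-delimited result (hypothetical values): the cf_clearance
# cookie and the User-Agent are appended so downstream players can replay them, e.g.
#   get_url_headers("http://example.com/video")
#   -> "http://example.com/video|User-Agent=Mozilla/5.0 ...&Cookie=cf_clearance=..."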
def load_cookies():
    cookies_lock.acquire()
    if os.path.isfile(ficherocookies):
        logger.info("Reading cookies file")
        try:
            cj.load(ficherocookies, ignore_discard=True)
        except:
            logger.info("The cookies file exists but is unreadable, deleting it")
            os.remove(ficherocookies)
    cookies_lock.release()
def save_cookies():
    cookies_lock.acquire()
    logger.info("Saving cookies...")
    cj.save(ficherocookies, ignore_discard=True)
    cookies_lock.release()
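
# Load any persisted cookies once, at module import time.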
load_cookies()
def downloadpage(url, post=None, headers=None, timeout=None, follow_redirects=True, cookies=True, replace_headers=False,
                 add_referer=False, only_headers=False, bypass_cloudflare=True, count_retries=0, random_headers=False):
    """
    Opens a url and returns the data obtained

    @param url: url to open.
    @type url: str
    @param post: If it contains any value, it is sent via POST.
    @type post: str
    @param headers: Headers for the request; if empty, the default headers are used.
    @type headers: dict, list
    @param timeout: Timeout for the request.
    @type timeout: int
    @param follow_redirects: Indicates whether redirects should be followed.
    @type follow_redirects: bool
    @param cookies: Indicates whether cookies should be used.
    @type cookies: bool
    @param replace_headers: If True, the headers passed via the "headers" parameter completely replace the default headers.
                            If False, the headers passed via the "headers" parameter update the default headers.
    @type replace_headers: bool
    @param add_referer: Indicates whether the "Referer" header should be added, using the domain of the url as its value.
    @type add_referer: bool
    @param only_headers: If True, only the headers are downloaded, skipping the url content.
    @type only_headers: bool
    @param random_headers: If True, a random User-Agent is selected.
    @type random_headers: bool
    @return: Result of the request
    @rtype: HTTPResponse

    Attribute               Type    Description
    ----------------------------------------------------------------------------------------------------------------
    HTTPResponse.sucess     bool    True: request completed successfully | False: an error occurred during the request
    HTTPResponse.code       int     Server response code, or error code if an error occurred
    HTTPResponse.error      str     Error description, if an error occurred
    HTTPResponse.headers    dict    Dictionary with the server's response headers
    HTTPResponse.data       str     Response body obtained from the server
    HTTPResponse.time       float   Time taken to complete the request
    """
    response = {}

    # Default headers, if nothing is specified
    request_headers = default_headers.copy()

    # Headers passed as parameters
    if headers is not None:
        if not replace_headers:
            request_headers.update(dict(headers))
        else:
            request_headers = dict(headers)

    if add_referer:
        request_headers["Referer"] = "/".join(url.split("/")[:3])

    if random_headers or HTTPTOOLS_DEFAULT_RANDOM_HEADERS:
        request_headers['User-Agent'] = random_useragent()

    url = urllib.quote(url, safe="%/:=&?~#+!$,;'@()*[]")

    # Limit download time if no timeout was passed and the global variable has a value set
    if timeout is None and HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT is not None:
        timeout = HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT
logger.info("----------------------------------------------")
logger.info("downloadpage Alfa: %s" %__version)
logger.info("----------------------------------------------")
logger.info("Timeout: %s" % timeout)
logger.info("URL: " + url)
logger.info("Dominio: " + urlparse.urlparse(url)[1])
if post:
logger.info("Peticion: POST")
else:
logger.info("Peticion: GET")
logger.info("Usar Cookies: %s" % cookies)
logger.info("Descargar Pagina: %s" % (not only_headers))
logger.info("Fichero de Cookies: " + ficherocookies)
logger.info("Headers:")
for header in request_headers:
logger.info("- %s: %s" % (header, request_headers[header]))
    # Handlers
    handlers = [urllib2.HTTPHandler(debuglevel=False)]
    if not follow_redirects:
        handlers.append(NoRedirectHandler())
    if cookies:
        handlers.append(urllib2.HTTPCookieProcessor(cj))
    opener = urllib2.build_opener(*handlers)

    logger.info("Making request")
    # Timer
    inicio = time.time()
    req = urllib2.Request(url, post, request_headers)
    try:
        if urllib2.__version__ == "2.4":
            # urllib2 2.4 has no per-request timeout, so the socket default is swapped temporarily
            import socket
            deftimeout = socket.getdefaulttimeout()
            if timeout is not None:
                socket.setdefaulttimeout(timeout)
            handle = opener.open(req)
            socket.setdefaulttimeout(deftimeout)
        else:
            handle = opener.open(req, timeout=timeout)
    except urllib2.HTTPError, handle:
        response["sucess"] = False
        response["code"] = handle.code
        response["error"] = handle.__dict__.get("reason", str(handle))
        response["headers"] = handle.headers.dict
        if not only_headers:
            response["data"] = handle.read()
        else:
            response["data"] = ""
        response["time"] = time.time() - inicio
        response["url"] = handle.geturl()
    except Exception, e:
        response["sucess"] = False
        response["code"] = e.__dict__.get("errno", e.__dict__.get("code", str(e)))
        response["error"] = e.__dict__.get("reason", str(e))
        response["headers"] = {}
        response["data"] = ""
        response["time"] = time.time() - inicio
        response["url"] = url
    else:
        response["sucess"] = True
        response["code"] = handle.code
        response["error"] = None
        response["headers"] = handle.headers.dict
        if not only_headers:
            response["data"] = handle.read()
        else:
            response["data"] = ""
        response["time"] = time.time() - inicio
        response["url"] = handle.geturl()
logger.info("Terminado en %.2f segundos" % (response["time"]))
logger.info("Response sucess: %s" % (response["sucess"]))
logger.info("Response code: %s" % (response["code"]))
logger.info("Response error: %s" % (response["error"]))
logger.info("Response data length: %s" % (len(response["data"])))
logger.info("Response headers:")
server_cloudflare = ""
for header in response["headers"]:
logger.info("- %s: %s" % (header, response["headers"][header]))
if "cloudflare" in response["headers"][header]:
server_cloudflare = "cloudflare"
    is_channel = inspect.getmodule(inspect.currentframe().f_back)
    # 4xx or 5xx errors raise an exception (except for server modules),
    # excluding Cloudflare 503 challenges, which the bypass below handles
    # response["code"] = 400  # line of code for testing
    is_channel = str(is_channel).replace("/servers/", "\\servers\\")  # On operating systems other than Windows the path differs
    if type(response["code"]) == int and "\\servers\\" not in str(is_channel):
        if response["code"] > 399 and not (server_cloudflare == "cloudflare" and response["code"] == 503):
            raise WebErrorException(urlparse.urlparse(url)[1])
    if cookies:
        save_cookies()

    logger.info("Encoding: %s" % (response["headers"].get('content-encoding')))
    if response["headers"].get('content-encoding') == 'gzip':
        logger.info("Decompressing...")
        try:
            response["data"] = gzip.GzipFile(fileobj=StringIO(response["data"])).read()
            logger.info("Decompressed")
        except:
            logger.info("Could not decompress")
    # Anti Cloudflare
    if bypass_cloudflare and count_retries < 5:
        cf = Cloudflare(response)
        if cf.is_cloudflare:
            count_retries += 1
            logger.info("Cloudflare detected, waiting %s seconds..." % cf.wait_time)
            auth_url = cf.get_url()
            logger.info("Authorizing... attempt %d url: %s" % (count_retries, auth_url))
            if downloadpage(auth_url, headers=request_headers, replace_headers=True, count_retries=count_retries).sucess:
                logger.info("Authorization successful, downloading page")
                resp = downloadpage(url=response["url"], post=post, headers=headers, timeout=timeout,
                                    follow_redirects=follow_redirects,
                                    cookies=cookies, replace_headers=replace_headers, add_referer=add_referer)
                response["sucess"] = resp.sucess
                response["code"] = resp.code
                response["error"] = resp.error
                response["headers"] = resp.headers
                response["data"] = resp.data
                response["time"] = resp.time
                response["url"] = resp.url
            else:
                logger.info("Could not authorize")
    return type('HTTPResponse', (), response)
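
# A minimal usage sketch (hypothetical url; kept commented out so nothing runs at import time):
#   resp = downloadpage("http://example.com", post=urllib.urlencode({"q": "test"}))
#   if resp.sucess:
#       logger.info("Got %s bytes with code %s" % (len(resp.data), resp.code))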
def random_useragent():
    """
    Based on code from https://github.com/theriley106/RandomHeaders
    Python method that generates fake user agents from a locally saved DB (.csv file).
    This is useful for web scraping, and for testing programs that identify devices based on the user agent.
    """
    import random
    import csv
    UserAgentPath = os.path.join(config.get_runtime_path(), 'tools', 'UserAgent.csv')
    if os.path.exists(UserAgentPath):
        UserAgentCSV = open(UserAgentPath, 'r')
        # Keep the first column of every non-empty row; a blank row would otherwise raise IndexError
        UserAgentList = [row[0] for row in csv.reader(UserAgentCSV) if row]
        UserAgentCSV.close()
        if UserAgentList:
            return random.choice(UserAgentList).strip()
    return default_headers["User-Agent"]
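
# Expected CSV layout (an assumption: one user agent string per row, in the first column):
#   Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...
#   Mozilla/5.0 (X11; Linux x86_64) ...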
class NoRedirectHandler(urllib2.HTTPRedirectHandler):
    def http_error_302(self, req, fp, code, msg, headers):
        infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        infourl.status = code
        infourl.code = code
        return infourl

    http_error_300 = http_error_302
    http_error_301 = http_error_302
    http_error_303 = http_error_302
    http_error_307 = http_error_302
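
# A minimal sketch of capturing a redirect target instead of following it
# (hypothetical url; the Location header is returned in resp.headers):
#   resp = downloadpage("http://example.com/redirect", follow_redirects=False)
#   location = resp.headers.get("location", "")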