This commit is contained in:
marco
2019-12-27 23:03:21 +01:00
parent e1c3900039
commit 0692132a3d
47 changed files with 218 additions and 3296 deletions
+76 -82
View File
@@ -1,27 +1,17 @@
# -*- coding: utf-8 -*-
# --------------------------------------------------------------------------------
# Scraper tools for reading and processing web elements
# Scraper tools v2 for reading and processing web elements
# --------------------------------------------------------------------------------
import re
import time
from core import httptools
import urlparse
from core.entities import html5
from platformcode import logger
def get_header_from_response(url, header_to_get="", post=None, headers=None):
    """Request only the headers of *url* and return a single one of them.

    :param url: address passed to ``httptools.downloadpage``.
    :param header_to_get: name of the wanted header; it is lower-cased before the lookup.
    :param post: forwarded unchanged as the ``post`` argument of the download call.
    :param headers: forwarded unchanged as the ``headers`` argument of the download call.
    :return: whatever ``response.headers.get`` yields for the lower-cased name.
    """
    wanted = header_to_get.lower()
    # only_headers=True is forwarded to the downloader; presumably it skips the body - confirm in httptools
    headers_only = httptools.downloadpage(url, post=post, headers=headers, only_headers=True)
    received = headers_only.headers
    return received.get(wanted)
def read_body_and_headers(url, post=None, headers=None, follow_redirects=False, timeout=None):
    """Download *url* and return its body together with the response headers.

    All keyword arguments are forwarded unchanged to ``httptools.downloadpage``.

    :return: tuple ``(data, headers)`` taken from the response object.
    """
    page = httptools.downloadpage(
        url,
        post=post,
        headers=headers,
        follow_redirects=follow_redirects,
        timeout=timeout
    )
    body = page.data
    response_headers = page.headers
    return body, response_headers
def printMatches(matches):
i = 0
for match in matches:
@@ -42,8 +32,37 @@ def find_multiple_matches(text, pattern):
return re.findall(pattern, text, re.DOTALL)
def entityunescape(cadena):
    """Return *cadena* after passing it through ``unescape``.

    This is a thin alias: the argument is handed to ``unescape`` as-is and
    its result is returned untouched.
    """
    decoded = unescape(cadena)
    return decoded
def find_multiple_matches_groups(text, pattern):
    """Collect the named groups of every match of *pattern* found in *text*.

    The pattern is applied without extra flags, so ``.`` does not cross line breaks.

    :param text: string to scan.
    :param pattern: regular expression, normally containing ``(?P<name>...)`` groups.
    :return: list with one ``groupdict()`` per match, in order of appearance
             (empty dicts when the pattern has no named groups).
    """
    return [hit.groupdict() for hit in re.finditer(pattern, text)]
# Converts HTML entity codes (named or numeric) into the matching UTF-8 encoded character
def decodeHtmlentities(data):
    """Replace the HTML entities found in *data* with UTF-8 encoded characters.

    Named entities are looked up in the ``html5`` table; numeric references
    (``&#NNN``, and ``&#xHH;`` when terminated by a semicolon) are converted
    with ``unichr``. Anything that cannot be resolved is left exactly as it
    appeared in the input instead of raising.

    :param data: text to process (presumably a Python 2 byte string, since the
                 replacements are UTF-8 encoded - confirm against callers).
    :return: *data* with every resolvable entity substituted.
    """
    entity_re = re.compile(r"&(#?)(\d{1,5}|\w{1,8})(;?)")

    def substitute_entity(match):
        is_numeric = match.group(1) == "#"
        ent = match.group(2) + match.group(3)
        res = ""

        # Exception for when '&' is used as an argument separator in URLs contained
        # in the data: the candidate has no closing ';', so trailing characters are
        # peeled off one by one (and kept in `res`) until a known entity name remains
        # or nothing is left.
        while ent and ent not in html5 and not ent.endswith(";") and not is_numeric:
            res = ent[-1] + res
            ent = ent[:-1]

        if is_numeric:
            digits = match.group(2)
            try:
                # Hexadecimal form is only honoured when explicitly closed with ';',
                # otherwise following text could be swallowed into the number.
                if digits[:1] in ("x", "X") and match.group(3) == ";":
                    codepoint = int(digits[1:], 16)
                else:
                    codepoint = int(digits)
                return unichr(codepoint).encode('utf-8')
            except (ValueError, OverflowError):
                # Not a valid number or not a representable code point: keep the original text
                return match.group()

        cp = html5.get(ent)
        if cp:
            # Table values are escape sequences, hence the "unicode-escape" decoding
            return cp.decode("unicode-escape").encode('utf-8') + res
        return match.group()

    return entity_re.subn(substitute_entity, data)[0]
def unescape(text):
@@ -84,47 +103,6 @@ def unescape(text):
# Converts HTML entity codes (named or numeric) into the matching UTF-8 encoded character
def decodeHtmlentities(string):
    """Replace the HTML entities found in *string* with UTF-8 encoded characters.

    The input is first normalised with ``entitiesfix``. Named entities are
    resolved through ``htmlentitydefs.name2codepoint``; numeric references may
    be decimal (``&#241;``) or hexadecimal (``&#xF1;``). Entities that cannot be
    resolved are left exactly as they appeared instead of raising.

    :param string: text to process.
    :return: *string* with every resolvable entity substituted.
    """
    string = entitiesfix(string)
    entity_re = re.compile(r"&(#?)(\d{1,5}|\w{1,8});")

    def substitute_entity(match):
        from htmlentitydefs import name2codepoint as n2cp
        ent = match.group(2)

        if match.group(1) == "#":
            try:
                if ent[:1] in ("x", "X"):
                    codepoint = int(ent[1:], 16)
                else:
                    codepoint = int(ent)
                return unichr(codepoint).encode('utf-8')
            except (ValueError, OverflowError):
                # Not a valid number or not a representable code point: keep the original text
                return match.group()

        cp = n2cp.get(ent)
        if cp:
            return unichr(cp).encode('utf-8')
        return match.group()

    return entity_re.subn(substitute_entity, string)[0]
def entitiesfix(string):
    """Rewrite a fixed set of entity prefixes and collapse doubled semicolons.

    Each listed prefix is replaced wherever it occurs, then every ``;;`` pair
    is reduced to a single ``;``.

    NOTE(review): the replacements do not consume a trailing ';', so an input
    such as "&aacute;" keeps its semicolon after the substitution - confirm
    this is the intended result.

    :param string: text to normalise.
    :return: the normalised text.
    """
    # Entities always begin with the '&' symbol and end with a semicolon (';').
    replacements = (
        ("&aacute", "á"),
        ("&eacute", "é"),
        ("&iacute", "í"),
        ("&oacute", "ó"),
        ("&uacute", "ú"),
        ("&Aacute", "Á"),
        ("&Eacute", "É"),
        ("&Iacute", "Í"),
        ("&Oacute", "Ó"),
        ("&Uacute", "Ú"),
        ("&uuml", "ü"),
        ("&Uuml", "Ü"),
        ("&ntilde", "ñ"),
        ("&#191", "¿"),
        ("&#161", "¡"),
    )
    fixed = string
    for entity, character in replacements:
        fixed = fixed.replace(entity, character)
    return fixed.replace(";;", ";")
def htmlclean(cadena):
cadena = re.compile("<!--.*?-->", re.DOTALL).sub("", cadena)
@@ -226,7 +204,7 @@ def htmlclean(cadena):
cadena = re.compile("<link[^>]*>", re.DOTALL).sub("", cadena)
cadena = cadena.replace("\t", "")
cadena = entityunescape(cadena)
# cadena = entityunescape(cadena)
return cadena
@@ -314,8 +292,8 @@ def remove_show_from_title(title, show):
return title
# scrapertools.get_filename_from_url(media_url)[-4:]
def get_filename_from_url(url):
import urlparse
parsed_url = urlparse.urlparse(url)
try:
filename = parsed_url.path
@@ -332,19 +310,18 @@ def get_filename_from_url(url):
return filename
# def get_domain_from_url(url):
# import urlparse
# parsed_url = urlparse.urlparse(url)
# try:
# filename = parsed_url.netloc
# except:
# # Si falla es porque la implementación de parsed_url no reconoce los atributos como "path"
# if len(parsed_url) >= 4:
# filename = parsed_url[1]
# else:
# filename = ""
#
# return filename
def get_domain_from_url(url):
    """Return the network-location part (host, plus port if present) of *url*.

    :param url: address to analyse.
    :return: the ``netloc`` component reported by ``urlparse.urlparse``, or an
             empty string when the URL has none.
    """
    parsed_url = urlparse.urlparse(url)
    try:
        domain = parsed_url.netloc
    except AttributeError:
        # The parse result does not expose named attributes such as "netloc",
        # so fall back to positional access (index 1 holds the network location).
        if len(parsed_url) >= 4:
            domain = parsed_url[1]
        else:
            domain = ""

    return domain
def get_season_and_episode(title):
@@ -365,22 +342,15 @@ def get_season_and_episode(title):
@return: Numero de temporada y episodio en formato "1x01" o cadena vacia si no se han encontrado
"""
filename = ""
# 4l3x87 - fix for series example 9-1-1
# original_title = title
# title = title.replace('9-1-1','')
patrons = ["(\d+)\s*[x-]\s*(\d+)", "(\d+)\s*×\s*(\d+)", "(?:s|t)(\d+)e(\d+)",
"(?:season|temp|stagione\w*)\s*(\d+)\s*(?:capitulo|epi|episode|episodio\w*)\s*(\d+)"]
patrons = ["(\d+)x(\d+)", "(?:s|t)(\d+)e(\d+)",
"(?:season|temp\w*)\s*(\d+)\s*(?:capitulo|epi\w*)\s*(\d+)"]
for patron in patrons:
try:
matches = re.compile(patron, re.I).search(title)
if matches:
if len(matches.group(1)) == 1:
filename = matches.group(1) + "x" + matches.group(2).zfill(2)
else:
filename = matches.group(1).lstrip('0') + "x" + matches.group(2).zfill(2)
filename = matches.group(1) + "x" + matches.group(2).zfill(2)
break
except:
pass
@@ -388,3 +358,27 @@ def get_season_and_episode(title):
logger.info("'" + title + "' -> '" + filename + "'")
return filename
def get_sha1(cadena):
    """Return the SHA-1 digest of *cadena* as a hexadecimal string.

    ``hashlib`` is used when available; the legacy ``sha`` module is only a
    fallback for interpreters that lack ``hashlib``. Errors raised while
    hashing (for example an unsupported input type) propagate to the caller
    instead of being masked by the fallback path.

    :param cadena: data to hash, in a form accepted by the hashing module.
    :return: hexadecimal digest.
    """
    try:
        import hashlib
    except ImportError:
        # Very old interpreters without hashlib
        import sha
        import binascii
        return binascii.hexlify(sha.new(cadena).digest())

    return hashlib.sha1(cadena).hexdigest()
def get_md5(cadena):
    """Return the MD5 digest of *cadena* as a hexadecimal string.

    ``hashlib`` is used when available; the legacy ``md5`` module is only a
    fallback for interpreters that lack ``hashlib``. Errors raised while
    hashing (for example an unsupported input type) propagate to the caller
    instead of being masked by the fallback path.

    :param cadena: data to hash, in a form accepted by the hashing module.
    :return: hexadecimal digest.
    """
    try:
        import hashlib
    except ImportError:
        # Very old interpreters without hashlib
        import md5
        import binascii
        return binascii.hexlify(md5.new(cadena).digest())

    return hashlib.md5(cadena).hexdigest()