From b6a670705f11d38e88f641b70b385852c656210e Mon Sep 17 00:00:00 2001 From: Alhaziel Date: Tue, 21 Jan 2020 09:34:16 +0100 Subject: [PATCH] Fix --- core/scrapertools.py | 27 ++++++++++++++++++++++++--- core/support.py | 5 +---- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/core/scrapertools.py b/core/scrapertools.py index 5b6d4bac..d303593d 100644 --- a/core/scrapertools.py +++ b/core/scrapertools.py @@ -39,9 +39,30 @@ def find_multiple_matches_groups(text, pattern): # Convierte los codigos html "ñ" y lo reemplaza por "ñ" caracter unicode utf-8 def decodeHtmlentities(data): - import HTMLParser - parser = HTMLParser.HTMLParser() - return parser.unescape(data) + entity_re = re.compile("&(#?)(\d{1,5}|\w{1,8})(;?)") + + def substitute_entity(match): + ent = match.group(2) + match.group(3) + res = "" + while not ent in html5 and not ent.endswith(";") and match.group(1) != "#": + # Excepción para cuando '&' se usa como argumento en la urls contenidas en los datos + try: + res = ent[-1] + res + ent = ent[:-1] + except: + break + + if match.group(1) == "#" and ent.replace(";", "").isdigit(): + ent = unichr(int(ent.replace(";", ""))) + return ent.encode('utf-8') + else: + cp = html5.get(ent) + if cp: + return cp.decode("unicode-escape").encode('utf-8') + res + else: + return match.group() + + return entity_re.subn(substitute_entity, data)[0] def unescape(text): diff --git a/core/support.py b/core/support.py index 26caf1b8..7e1246ee 100755 --- a/core/support.py +++ b/core/support.py @@ -168,10 +168,7 @@ def scrapeLang(scraped, lang, longtitle): return language, longtitle def cleantitle(title): - try: - cleantitle = scrapertools.htmlclean(scrapertools.decodeHtmlentities(title).replace('"', "'").replace('×', 'x').replace('–', '-')).strip() - except: - cleantitle = title + cleantitle = scrapertools.htmlclean(scrapertools.decodeHtmlentities(title).replace('"', "'").replace('×', 'x').replace('–', '-')).strip() return cleantitle def scrapeBlock(item, args, block, patron, headers, action, pagination, debug, typeContentDict, typeActionDict, blacklist, search, pag, function, lang):