From d4f83a97bb4eef0cb9893fad543f41b30dd52662 Mon Sep 17 00:00:00 2001 From: marco Date: Mon, 20 Jan 2020 19:30:55 +0100 Subject: [PATCH] fix cb01 -> sezione film (causa oscar..) --- channels/cineblog01.py | 2 +- core/scrapertools.py | 27 +++------------------------ 2 files changed, 4 insertions(+), 25 deletions(-) diff --git a/channels/cineblog01.py b/channels/cineblog01.py index 64a82ac5..82889be8 100644 --- a/channels/cineblog01.py +++ b/channels/cineblog01.py @@ -131,7 +131,7 @@ def peliculas(item): # esclusione degli articoli 'di servizio' blacklist = ['BENVENUTI', 'Richieste Serie TV', 'CB01.UNO ▶ TROVA L’INDIRIZZO UFFICIALE ', 'Aggiornamento Quotidiano Serie TV', 'OSCAR 2019 ▶ CB01.UNO: Vota il tuo film preferito! 🎬', - 'Openload: la situazione. Benvenuto Verystream', 'Openload: lo volete ancora?'] + 'Openload: la situazione. Benvenuto Verystream', 'Openload: lo volete ancora?', 'OSCAR 2020 ▶ VOTA IL TUO FILM PREFERITO! 🎬'] # debug = True if 'newest' in item.args: if '/serietv/' not in item.url: diff --git a/core/scrapertools.py b/core/scrapertools.py index 5bb50a7b..5b6d4bac 100644 --- a/core/scrapertools.py +++ b/core/scrapertools.py @@ -39,30 +39,9 @@ def find_multiple_matches_groups(text, pattern): # Convierte los codigos html "ñ" y lo reemplaza por "ñ" caracter unicode utf-8 def decodeHtmlentities(data): - entity_re = re.compile("&(#?)(\d{1,5}|\w{1,8})(;?)") - - def substitute_entity(match): - ent = match.group(2) + match.group(3) - res = "" - while not ent in html5 and not ent.endswith(";") and match.group(1) != "#": - # Excepción para cuando '&' se usa como argumento en la urls contenidas en los datos - try: - res = ent[-1] + res - ent = ent[:-1] - except: - break - - if match.group(1) == "#": - ent = unichr(int(ent.replace(";", ""))) - return ent.encode('utf-8') - else: - cp = html5.get(ent) - if cp: - return cp.decode("unicode-escape").encode('utf-8') + res - else: - return match.group() - - return entity_re.subn(substitute_entity, data)[0] + import HTMLParser + parser = HTMLParser.HTMLParser() + return parser.unescape(data) def unescape(text):