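Fix: reimplement decodeHtmlentities with a regex and the html5 entity table instead of HTMLParser.unescape; remove the try/except fallback in cleantitle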

2020-01-21 09:34:16 +01:00
parent 7bf337b38b
commit b6a670705f
2 changed files with 25 additions and 7 deletions
--- a/core/scrapertools.py
+++ b/core/scrapertools.py
@@ -39,9 +39,30 @@ def find_multiple_matches_groups(text, pattern):

 # Convierte los codigos html "&ntilde;" y lo reemplaza por "ñ" caracter unicode utf-8
 def decodeHtmlentities(data):
-    import HTMLParser
-    parser = HTMLParser.HTMLParser()
-    return parser.unescape(data)
+    entity_re = re.compile("&(#?)(\d{1,5}|\w{1,8})(;?)")  # groups: 1 = "#" when the reference is numeric, 2 = digits or entity name, 3 = optional ";"
+    # Replacement callback for entity_re: returns the UTF-8 encoded text for one matched entity, or the matched text unchanged.
+    def substitute_entity(match):
+        ent = match.group(2) + match.group(3)
+        res = ""
+        while not ent in html5 and not ent.endswith(";") and match.group(1) != "#":
+            # '&' may just be an argument separator in a URL contained in the data: move trailing characters from ent into res until ent is a key of html5
+            try:
+                res = ent[-1] + res
+                ent = ent[:-1]
+            except:  # ent[-1] fails once ent is empty, i.e. no prefix of the name was found in html5
+                break
+        # Numeric reference: convert the decimal code point to a character and return it UTF-8 encoded.
+        if match.group(1) == "#" and ent.replace(";", "").isdigit():  # NOTE(review): \d{1,5} matches at most 5 digits, so longer references (e.g. &#128512;) are only partially matched; hex "&#x..;" is not all digits and takes the else branch — confirm both are acceptable
+            ent = unichr(int(ent.replace(";", "")))  # unichr is the Python 2 builtin
+            return ent.encode('utf-8')
+        else:
+            cp = html5.get(ent)  # html5 is defined outside this hunk; presumably maps entity names to escaped code point strings — verify
+            if cp:
+                return cp.decode("unicode-escape").encode('utf-8') + res  # res re-appends the characters moved out of ent by the loop above
+            else:
+                return match.group()  # not a known entity: keep the matched text as is
+    # subn returns (new_text, number_of_substitutions); only the text is returned.
+    return entity_re.subn(substitute_entity, data)[0]


 def unescape(text):
--- a/core/support.py
+++ b/core/support.py
@@ -168,10 +168,7 @@ def scrapeLang(scraped, lang, longtitle):
    return language, longtitle

 def cleantitle(title):
-    try:
-        cleantitle = scrapertools.htmlclean(scrapertools.decodeHtmlentities(title).replace('"', "'").replace('×', 'x').replace('–', '-')).strip()
-    except:
-        cleantitle = title
+    cleantitle = scrapertools.htmlclean(scrapertools.decodeHtmlentities(title).replace('"', "'").replace('×', 'x').replace('–', '-')).strip()  # decode entities, replace '"' with "'", '×' with 'x' and '–' with '-', pass the result through scrapertools.htmlclean and strip outer whitespace; with the try/except removed, any error raised here now propagates instead of falling back to the raw title
    return cleantitle

 def scrapeBlock(item, args, block, patron, headers, action, pagination, debug, typeContentDict, typeActionDict, blacklist, search, pag, function, lang):