diff --git a/core/scrapertools.py b/core/scrapertools.py
index aa926c8b..133e5d3b 100644
--- a/core/scrapertools.py
+++ b/core/scrapertools.py
@@ -104,6 +104,7 @@ def unescape(text):
     from Fredrik Lundh
     http://effbot.org/zone/re-sub.htm#unescape-html
     """
+
     if not ('&' in text and ';' in text): return text
 
 
@@ -129,13 +130,16 @@ def unescape(text):
                 import html.entities as htmlentitydefs
             else:
                 import htmlentitydefs
-            text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]).encode("utf-8")
+            ret = unichr(htmlentitydefs.name2codepoint[text[1:-1]]).encode("utf-8")
         except KeyError:
             logger.error("keyerror")
             pass
         except:
             pass
-        return text # leave as is
+        # from core.support import dbg;dbg()
+        if type(ret) != str:
+            ret = ret.decode()
+        return ret # leave as is
 
     return re.sub("&#?\w+;", fixup, str(text))
 
diff --git a/core/support.py b/core/support.py
index 408ec878..7432dd91 100755
--- a/core/support.py
+++ b/core/support.py
@@ -895,12 +895,13 @@ def match(item_url_string, **args):
         matches: all the matches
     '''
-    matches = blocks = []
+    matches = []
+    blocks = []
     url = None
 
     # arguments allowed for scrape
     patron = args.get('patron', None)
     patronBlock = args.get('patronBlock', None)
-    patronBlocks = args.get('patronBlock', None)
+    patronBlocks = args.get('patronBlocks', None)
     debug = args.get('debug', False)
     debugBlock = args.get('debugBlock', False)
     string = args.get('string', False)
@@ -934,8 +935,9 @@ def match(item_url_string, **args):
     if patronBlock:
         blocks = [scrapertools.find_single_match(data, patronBlock)]
     elif patronBlocks:
-        if type(patronBlock) == str: patron = [patronBlock]
-        for p in patronBlock:
+        if type(patronBlocks) == str:
+            patronBlocks = [patronBlocks]
+        for p in patronBlocks:
             blocks += scrapertools.find_multiple_matches(data, p)
     else:
         blocks = [data]