# -*- coding: utf-8 -*-
# --------------------------------------------------------------------------------
# Scraper tools for reading and processing web elements
# --------------------------------------------------------------------------------

#from future import standard_library
#standard_library.install_aliases()
#from builtins import str
#from builtins import chr
import sys

PY3 = False
if sys.version_info[0] >= 3:
    PY3 = True
    unicode = str
    unichr = chr
    long = int

import re
import time

# from core import httptools
from core.entities import html5
from platformcode import logger


# def get_header_from_response(url, header_to_get="", post=None, headers=None):
#     header_to_get = header_to_get.lower()
#     response = httptools.downloadpage(url, post=post, headers=headers, only_headers=True)
#     return response.headers.get(header_to_get)


# def read_body_and_headers(url, post=None, headers=None, follow_redirects=False, timeout=None):
#     response = httptools.downloadpage(url, post=post, headers=headers, follow_redirects=follow_redirects,
#                                       timeout=timeout)
#     return response.data, response.headers


def printMatches(matches):
    i = 0
    for match in matches:
        logger.debug("%d %s" % (i, match))
        i = i + 1


# Parse a string and extract the first match of a regular expression
def find_single_match(data, patron, index=0):
    try:
        if index == 0:
            matches = re.search(patron, data, flags=re.DOTALL)
            if matches:
                if len(matches.groups()) == 1:
                    return matches.group(1)
                elif len(matches.groups()) > 1:
                    return matches.groups()
                else:
                    return matches.group()
            else:
                return ""
        else:
            matches = re.findall(patron, data, flags=re.DOTALL)
            return matches[index]
    except:
        return ""


# Parse a string and extract multiple matches using regular expressions
def find_multiple_matches(text, pattern):
    return re.findall(pattern, text, re.DOTALL)


# Like find_multiple_matches, but returns a dict of named groups per match
def find_multiple_matches_groups(text, pattern):
    r = re.compile(pattern)
    return [m.groupdict() for m in r.finditer(text)]


# Convert html entities such as "&ntilde;" into the corresponding "ñ" unicode utf-8 character
def decodeHtmlentities(data):
    entity_re = re.compile(r"&(#?)(\d{1,5}|\w{1,8})(;?)")

    def substitute_entity(match):
        ent = match.group(2) + match.group(3)
        res = ""
        while not ent in html5 and not ent.endswith(";") and match.group(1) != "#":
            # Exception for when '&' is used as an argument in the urls contained in the data
            try:
                res = ent[-1] + res
                ent = ent[:-1]
            except:
                break

        if match.group(1) == "#" and ent.replace(";", "").isdigit():
            ent = unichr(int(ent.replace(";", "")))
            return ent if PY3 else ent.encode('utf-8')
        else:
            cp = html5.get(ent)
            if cp:
                if PY3:
                    return cp + res
                else:
                    return cp.decode("unicode-escape").encode('utf-8') + res
            else:
                return match.group()

    return entity_re.subn(substitute_entity, data)[0]
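
# Illustrative usage (not part of the original module): a minimal sketch of how the
# matching helpers above behave, assuming a simple HTML fragment.
#
#   html = '<a href="https://example.com/a">one</a> <a href="https://example.com/b">two</a>'
#   find_single_match(html, r'href="([^"]+)"')       # -> 'https://example.com/a'
#   find_multiple_matches(html, r'href="([^"]+)"')   # -> ['https://example.com/a', 'https://example.com/b']
#   find_multiple_matches_groups(html, r'href="(?P<url>[^"]+)">(?P<label>[^<]+)')
#   # -> [{'url': 'https://example.com/a', 'label': 'one'},
#   #     {'url': 'https://example.com/b', 'label': 'two'}]
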
def unescape(text):
    """
    Removes HTML or XML character references and entities from a text string.
    Keeps &amp;, &gt; and &lt; in the source code.
    from Fredrik Lundh
    http://effbot.org/zone/re-sub.htm#unescape-html
    """
    if not ('&' in text and ';' in text):
        return text

    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # character reference
            try:
                if text[:3] == "&#x":
                    ret = unichr(int(text[3:-1], 16))
                    return ret if PY3 else ret.encode("utf-8")
                else:
                    ret = unichr(int(text[2:-1]))
                    return ret if PY3 else ret.encode("utf-8")
            except ValueError:
                logger.error("value error")
                pass
        else:
            # named entity
            try:
                if PY3:
                    import html.entities as htmlentitydefs
                else:
                    import htmlentitydefs
                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
                if not PY3:
                    text = text.encode("utf-8")
            except KeyError:
                logger.error("key error")
                pass
            except:
                pass
        return text  # leave as is

    return re.sub(r"&#?\w+;", fixup, str(text))


# Convert html entities such as "&ntilde;" into the corresponding "ñ" unicode utf-8 character
# def decodeHtmlentities(string):
#     string = entitiesfix(string)
#     entity_re = re.compile("&(#?)(\d{1,5}|\w{1,8});")

#     def substitute_entity(match):
#         if PY3:
#             from html.entities import name2codepoint as n2cp
#         else:
#             from htmlentitydefs import name2codepoint as n2cp
#         ent = match.group(2)
#         if match.group(1) == "#":
#             return unichr(int(ent)).encode('utf-8')
#         else:
#             cp = n2cp.get(ent)
#             if cp:
#                 return unichr(cp).encode('utf-8')
#             else:
#                 return match.group()

#     return entity_re.subn(substitute_entity, string)[0]


# def entitiesfix(string):
#     # Entities always start with the symbol & and end with a semicolon ( ; ).
#     string = string.replace("&aacute;", "á")
#     string = string.replace("&eacute;", "é")
#     string = string.replace("&iacute;", "í")
#     string = string.replace("&oacute;", "ó")
#     string = string.replace("&uacute;", "ú")
#     string = string.replace("&Aacute;", "Á")
#     string = string.replace("&Eacute;", "É")
#     string = string.replace("&Iacute;", "Í")
#     string = string.replace("&Oacute;", "Ó")
#     string = string.replace("&Uacute;", "Ú")
#     string = string.replace("&uuml;", "ü")
#     string = string.replace("&Uuml;", "Ü")
#     string = string.replace("&ntilde;", "ñ")
#     string = string.replace("&iquest;", "¿")
#     string = string.replace("&iexcl;", "¡")
#     string = string.replace(";;", ";")
#     return string
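
# Illustrative usage (not part of the original module): a minimal sketch of the two
# entity decoders above, assuming PY3 behaviour (str in, str out) and assuming the
# core.entities.html5 table contains the named entities used here.
#
#   decodeHtmlentities("Espa&ntilde;a &amp; Per&uacute;")   # -> 'España & Perú'
#   unescape("caf&eacute; &#233; &#xe9;")                   # -> 'café é é'
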
", "") cadena = cadena.replace("
", "") cadena = cadena.replace("", "") cadena = cadena.replace("", "") cadena = cadena.replace("", "") cadena = cadena.replace("", "") cadena = cadena.replace("", "") cadena = cadena.replace("", "") cadena = cadena.replace("
  • ", "") cadena = cadena.replace("
  • ", "") cadena = cadena.replace("", "") cadena = cadena.replace("", "") cadena = cadena.replace("", "") cadena = cadena.replace("", "") cadena = cadena.replace("", "") cadena = cadena.replace("
    ", " ") cadena = cadena.replace("
    ", " ") cadena = cadena.replace("
    ", " ") cadena = re.compile("]*>", re.DOTALL).sub(" ", cadena) cadena = re.compile("", re.DOTALL).sub("", cadena) cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("

    ", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("", re.DOTALL).sub("", cadena) cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("\t", "") # cadena = entityunescape(cadena) return cadena def slugify(title): # print title # Substitutes accents and eñes title = title.replace("Á", "a") title = title.replace("É", "e") title = title.replace("Í", "i") title = title.replace("Ó", "o") title = title.replace("Ú", "u") title = title.replace("á", "a") title = title.replace("é", "e") title = title.replace("í", "i") title = title.replace("ó", "o") title = title.replace("ú", "u") title = title.replace("À", "a") title = title.replace("È", "e") title = title.replace("Ì", "i") title = title.replace("Ò", "o") title = title.replace("Ù", "u") title = title.replace("à", "a") title = title.replace("è", "e") title = title.replace("ì", "i") title = title.replace("ò", "o") title = title.replace("ù", "u") title = title.replace("ç", "c") title = title.replace("Ç", "C") title = title.replace("Ñ", "n") title = title.replace("ñ", "n") title = title.replace("/", "-") title = title.replace("&", "&") # Lowercase title = title.lower().strip() # Remove invalid characters validchars = "abcdefghijklmnopqrstuvwxyz1234567890- " title = ''.join(c for c in title if c in validchars) # Replace duplicate blanks and line breaks title = re.compile(r"\s+", re.DOTALL).sub(" ", title) # Replace blanks with hyphens title = re.compile(r"\s", re.DOTALL).sub("-", title.strip()) # Replace duplicate blanks and line breaks title = re.compile(r"\-+", re.DOTALL).sub("-", title) # Fix special cases if title.startswith("-"): title = title[1:] if title == "": title = "-" + str(time.time()) return title def remove_htmltags(string): return re.sub('<[^<]+?>', '', string) def remove_show_from_title(title, show): # print slugify(title)+" == "+slugify(show) # Remove program name from title if slugify(title).startswith(slugify(show)): # Convert to unicode first, or encoding is lost title = unicode(title, "utf-8", "replace") show = unicode(show, "utf-8", "replace") title = title[len(show):].strip() if title.startswith("-"): title = title[1:].strip() if title == "": title = str(time.time()) # Return to utf-8 title = title.encode("utf-8", "ignore") show = show.encode("utf-8", "ignore") return title def get_filename_from_url(url): if PY3: import urllib.parse as urlparse # It is very slow in PY2. 
def get_filename_from_url(url):
    if PY3:
        import urllib.parse as urlparse    # Very slow in PY2, but native in PY3
    else:
        import urlparse                    # Use the PY2 native module, which is faster

    parsed_url = urlparse.urlparse(url)
    try:
        filename = parsed_url.path
    except:
        # If this fails, the urlparse implementation does not expose attributes such as "path"
        if len(parsed_url) >= 4:
            filename = parsed_url[2]
        else:
            filename = ""

    if "/" in filename:
        filename = filename.split("/")[-1]

    return filename


def get_domain_from_url(url):
    if PY3:
        import urllib.parse as urlparse    # Very slow in PY2, but native in PY3
    else:
        import urlparse                    # Use the PY2 native module, which is faster

    parsed_url = urlparse.urlparse(url)
    try:
        filename = parsed_url.netloc
    except:
        # If this fails, the urlparse implementation does not expose attributes such as "netloc"
        if len(parsed_url) >= 4:
            filename = parsed_url[1]
        else:
            filename = ""

    return filename


def get_season_and_episode(title):
    """
    Returns the season and episode number in "1x01" format obtained from the title of an episode.

    Examples of different values for title and the corresponding return value:
        "serie 101x1.strm", "s101e1.avi", "t101e1.avi" -> '101x01'
        "Name TvShow 1x6.avi" -> '1x06'
        "Temp 3 episodio 2.avi" -> '3x02'
        "Alcantara season 13 episodie 12.avi" -> '13x12'
        "Temp1 capitulo 14" -> '1x14'
        "Temporada 1: El origen Episodio 9" -> '' (no other text may appear between the season number and the episode)
        "Episodio 25: titulo episodio" -> '' (there is no season number)
        "Serie X Temporada 1" -> '' (there is no episode number)

    @type title: str
    @param title: title of a series episode
    @rtype: str
    @return: season and episode number in "1x01" format, or an empty string if not found
    """
    filename = ""

    patrons = [r"(\d+)\s*[x-]\s*(\d+)",
               r"(\d+)\s*×\s*(\d+)",
               r"(?:[Ss]|[Tt])(\d+)\s?(?:[Ee]|Ep\.?)(\d+)",
               r"(?:[Ss]tag|[Ss]eason|[Ss]tagione\w*)\s*(\d+)\s*(?:[Ee]pi|[Ee]pisode|[Ee]pisodio\w*)\s*(\d+)"]

    for patron in patrons:
        try:
            matches = re.compile(patron, re.I).search(title)
            if matches:
                filename = str(int(matches.group(1))) + "x" + str(int(matches.group(2))).zfill(2)
                break
        except:
            pass

    logger.debug("'" + title + "' -> '" + filename + "'")

    return filename


def get_sha1(cadena):
    if PY3 and isinstance(cadena, str):
        cadena = cadena.encode("utf-8")    # hashlib requires bytes on PY3
    try:
        import hashlib
        devuelve = hashlib.sha1(cadena).hexdigest()
    except:
        import sha
        import binascii
        devuelve = binascii.hexlify(sha.new(cadena).digest())

    return devuelve


def get_md5(cadena):
    if PY3 and isinstance(cadena, str):
        cadena = cadena.encode("utf-8")    # hashlib requires bytes on PY3
    try:
        import hashlib
        devuelve = hashlib.md5(cadena).hexdigest()
    except:
        import md5
        import binascii
        devuelve = binascii.hexlify(md5.new(cadena).digest())

    return devuelve
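
# Illustrative usage (not part of the original module): a minimal sketch of the URL,
# episode and hash helpers above, assuming the addon environment (core/platformcode)
# that this module already imports.
#
#   get_filename_from_url("https://example.com/videos/episode-1x02.mp4?token=abc")   # -> 'episode-1x02.mp4'
#   get_domain_from_url("https://example.com/videos/episode-1x02.mp4")               # -> 'example.com'
#   get_season_and_episode("Name TvShow 1x6.avi")                                    # -> '1x06'
#   get_md5("hello")    # -> '5d41402abc4b2a76b9719d911017c592'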