# -*- coding: utf-8 -*-
# --------------------------------------------------------------------------------
# Scraper tools for reading and processing web elements
# --------------------------------------------------------------------------------

import re
import time

from core import httptools
from platformcode import logger


def cache_page(url, post=None, headers=None, modo_cache=None, timeout=None):
    # Backwards-compatible snake_case alias for cachePage()
    return cachePage(url, post, headers, modo_cache, timeout=timeout)


def cachePage(url, post=None, headers=None, modoCache=None, timeout=None):
    # Legacy entry point kept for old callers; modoCache is accepted but
    # ignored, the request is delegated to downloadpage()
    data = downloadpage(url, post=post, headers=headers, timeout=timeout)
    return data


def downloadpage(url, post=None, headers=None, follow_redirects=True, timeout=None, header_to_get=None):
    """Download a web page and return its body (or one response header).

    @param url: address to download
    @param post: POST payload; a GET request is made when it is None
    @param headers: extra request headers
    @param follow_redirects: whether HTTP redirects are followed
    @param timeout: socket timeout in seconds
    @param header_to_get: when set, return the value of this response header
        instead of the page body
    """
    response = httptools.downloadpage(url, post=post, headers=headers,
                                      follow_redirects=follow_redirects, timeout=timeout)
    if header_to_get:
        return response.headers.get(header_to_get)
    else:
        return response.data


def downloadpageGzip(url):
    # httptools already handles gzip transparently; add_referer makes it send
    # the url's own domain as the Referer header
    response = httptools.downloadpage(url, add_referer=True)
    return response.data


def getLocationHeaderFromResponse(url):
    # Fetch only the headers to resolve a redirect target without downloading
    # the body
    response = httptools.downloadpage(url, only_headers=True)
    return response.headers.get("location")


def get_header_from_response(url, header_to_get="", post=None, headers=None):
    # Header names are matched case-insensitively, hence the lower()
    header_to_get = header_to_get.lower()
    response = httptools.downloadpage(url, post=post, headers=headers, only_headers=True)
    return response.headers.get(header_to_get)


def read_body_and_headers(url, post=None, headers=None, follow_redirects=False, timeout=None):
    # Return both the page body and the response headers in one request
    response = httptools.downloadpage(url, post=post, headers=headers,
                                      follow_redirects=follow_redirects, timeout=timeout)
    return response.data, response.headers


def printMatches(matches):
    # Debug helper: log every match together with its index
    for i, match in enumerate(matches):
        logger.info("%d %s" % (i, match))


def get_match(data, patron, index=0):
    # Like find_single_match(), but lets IndexError propagate when there is
    # no match (some callers rely on the exception)
    matches = re.findall(patron, data, flags=re.DOTALL)
    return matches[index]


def find_single_match(data, patron, index=0):
    """Return the index-th match of patron in data, or "" when absent/invalid."""
    try:
        matches = re.findall(patron, data, flags=re.DOTALL)
        return matches[index]
    except Exception:
        # Best-effort by design: bad pattern or missing match yields ""
        return ""


# Parse string and extract multiple matches using regular expressions
def find_multiple_matches(text, pattern):
    return re.findall(pattern, text, re.DOTALL)


def entityunescape(cadena):
    # Kept for backwards compatibility; unescape() does the real work
    return unescape(cadena)


def unescape(text):
    """Removes HTML or XML character references and entities from a text string.

    Keeps &amp;, &gt;, &lt; in the source code.
    From Fredrik Lundh, http://effbot.org/zone/re-sub.htm#unescape-html
    """
    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # numeric character reference (decimal or hexadecimal)
            try:
                if text[:3] == "&#x":
                    return unichr(int(text[3:-1], 16)).encode("utf-8")
                else:
                    return unichr(int(text[2:-1])).encode("utf-8")
            except ValueError:
                logger.error("error de valor")
        else:
            # named entity
            try:
                import htmlentitydefs
                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]).encode("utf-8")
            except KeyError:
                logger.error("keyerror")
            except:
                pass
        return text  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)


# Converts html entity codes (e.g. "&#241;") into the utf-8 character ("ñ")
def decodeHtmlentities(string):
    string = entitiesfix(string)
    entity_re = re.compile(r"&(#?)(\d{1,5}|\w{1,8});")

    def substitute_entity(match):
        from htmlentitydefs import name2codepoint as n2cp
        ent = match.group(2)
        if match.group(1) == "#":
            # numeric reference
            return unichr(int(ent)).encode('utf-8')
        else:
            cp = n2cp.get(ent)
            if cp:
                return unichr(cp).encode('utf-8')
            else:
                # unknown entity: leave it untouched
                return match.group()

    return entity_re.subn(substitute_entity, string)[0]


def entitiesfix(string):
    # Entities always start with "&" and end with a semicolon (";").
    # Replace the most common Spanish named entities with their utf-8 char.
    # NOTE(review): the entity-name literals were reconstructed from their
    # decoded characters in a garbled copy of this file -- verify upstream.
    for entity, char in (("&aacute;", "á"), ("&eacute;", "é"), ("&iacute;", "í"),
                         ("&oacute;", "ó"), ("&uacute;", "ú"),
                         ("&Aacute;", "Á"), ("&Eacute;", "É"), ("&Iacute;", "Í"),
                         ("&Oacute;", "Ó"), ("&Uacute;", "Ú"),
                         ("&uuml;", "ü"), ("&Uuml;", "Ü"),
                         ("&ntilde;", "ñ"), ("&iquest;", "¿"), ("&iexcl;", "¡")):
        string = string.replace(entity, char)
    # Collapse doubled semicolons left by malformed entities
    string = string.replace(";;", ";")
    return string


def htmlclean(cadena):
    """Strip HTML markup from a scraped snippet, keeping only its text.

    NOTE(review): the original implementation stripped a long hard-coded list
    of tags one by one with paired re.compile("<tag[^>]*>") / replace("</tag>")
    calls, but those literals were lost in a garbled copy of this file.  This
    reconstruction removes comments, scripts and tags generically, which
    matches the visible intent -- verify against the upstream source.
    """
    # HTML comments
    cadena = re.compile("<!--.*?-->", re.DOTALL).sub("", cadena)
    # scripts, including their content
    cadena = re.compile("<script.*?</script>", re.DOTALL).sub("", cadena)
    # CDATA markers
    cadena = cadena.replace("<![CDATA[", "")
    cadena = cadena.replace("]]>", "")
    # line-break tags become a space so adjacent words are not glued together
    cadena = re.compile("<br[^>]*>", re.DOTALL | re.IGNORECASE).sub(" ", cadena)
    # any remaining opening or closing tag
    cadena = re.compile("</?[A-Za-z][^>]*>", re.DOTALL).sub("", cadena)
    # tabs and html entities
    cadena = cadena.replace("\t", "")
    cadena = entityunescape(cadena)
    return cadena


def slugify(title):
    """Build a url/filename-safe slug: lowercase ascii words joined by "-"."""
    # Replace accented vowels, cedilla and enye with plain ascii letters;
    # "&amp;" is decoded and "/" turned into a dash before filtering
    for src, dst in (("Á", "a"), ("É", "e"), ("Í", "i"), ("Ó", "o"), ("Ú", "u"),
                     ("á", "a"), ("é", "e"), ("í", "i"), ("ó", "o"), ("ú", "u"),
                     ("À", "a"), ("È", "e"), ("Ì", "i"), ("Ò", "o"), ("Ù", "u"),
                     ("à", "a"), ("è", "e"), ("ì", "i"), ("ò", "o"), ("ù", "u"),
                     ("ç", "c"), ("Ç", "C"), ("Ñ", "n"), ("ñ", "n"),
                     ("/", "-"), ("&amp;", "&")):
        title = title.replace(src, dst)

    # Lowercase and trim
    title = title.lower().strip()

    # Drop every character outside the whitelist
    validchars = "abcdefghijklmnopqrstuvwxyz1234567890- "
    title = ''.join(c for c in title if c in validchars)

    # Collapse whitespace runs, turn whitespace into dashes, collapse dashes
    title = re.compile(r"\s+", re.DOTALL).sub(" ", title)
    title = re.compile(r"\s", re.DOTALL).sub("-", title.strip())
    title = re.compile(r"\-+", re.DOTALL).sub("-", title)

    # Special cases: no leading dash, and never return an empty slug
    if title.startswith("-"):
        title = title[1:]
    if title == "":
        title = "-" + str(time.time())

    return title


def remove_htmltags(string):
    # Drop every html tag, keeping the text in between
    return re.sub(r'<[^<]+?>', '', string)


def remove_show_from_title(title, show):
    # Strip the show name from the beginning of an episode title
    if slugify(title).startswith(slugify(show)):
        # Convert to unicode first, or the encoding gets lost (Python 2)
        title = unicode(title, "utf-8", "replace")
        show = unicode(show, "utf-8", "replace")
        title = title[len(show):].strip()

        if title.startswith("-"):
            title = title[1:].strip()

        # Never return an empty title
        if title == "":
            title = str(time.time())

        # Back to utf-8
        title = title.encode("utf-8", "ignore")
        show = show.encode("utf-8", "ignore")

    return title


def get_filename_from_url(url):
    import urlparse
    parsed_url = urlparse.urlparse(url)
    try:
        filename = parsed_url.path
    except:
        # Some urlparse implementations do not expose attributes like "path";
        # fall back to tuple indexing
        if len(parsed_url) >= 4:
            filename = parsed_url[2]
        else:
            filename = ""

    if "/" in filename:
        filename = filename.split("/")[-1]

    return filename


def get_season_and_episode(title):
    """Return the season and episode number in "1x01" format from an episode title.

    Examples of different title values and their returned value:
        "serie 101x1.strm", "s101e1.avi", "t101e1.avi" -> '101x01'
        "Name TvShow 1x6.avi" -> '1x06'
        "Temp 3 episodio 2.avi" -> '3x02'
        "Alcantara season 13 episodie 12.avi" -> '13x12'
        "Temp1 capitulo 14" -> '1x14'
        "Temporada 1: El origen Episodio 9" -> ''
            (no other text may appear between the season number and the episode)
        "Episodio 25: titulo episodio" -> '' (no season number present)
        "Serie X Temporada 1" -> '' (no episode number present)

    @type title: str
    @param title: episode title of a tv show
    @rtype: str
    @return: season and episode in "1x01" format, or an empty string when not found
    """
    filename = ""

    patrons = [r"(\d+)\s*[x-]\s*(\d+)", r"(\d+)\s*×\s*(\d+)", r"(?:s|t)(\d+)e(\d+)",
               r"(?:season|temp\w*)\s*(\d+)\s*(?:capitulo|epi|episode\w*)\s*(\d+)"]

    for patron in patrons:
        try:
            matches = re.compile(patron, re.I).search(title)
            if matches:
                # Keep a one-digit season as-is; strip leading zeros otherwise,
                # and always pad the episode to two digits
                if len(matches.group(1)) == 1:
                    filename = matches.group(1) + "x" + matches.group(2).zfill(2)
                else:
                    filename = matches.group(1).lstrip('0') + "x" + matches.group(2).zfill(2)
                break
        except:
            pass

    logger.info("'" + title + "' -> '" + filename + "'")

    return filename