# -*- coding: utf-8 -*-
# --------------------------------------------------------------------------------
# Scraper tools for reading and processing web elements
# --------------------------------------------------------------------------------

#from future import standard_library
#standard_library.install_aliases()
#from builtins import str
#from builtins import chr
import sys

PY3 = False
if sys.version_info[0] >= 3: PY3 = True; unicode = str; unichr = chr; long = int

import re
import time

# from core import httptools
from core.entities import html5
from platformcode import logger


# def get_header_from_response(url, header_to_get="", post=None, headers=None):
#     header_to_get = header_to_get.lower()
#     response = httptools.downloadpage(url, post=post, headers=headers, only_headers=True)
#     return response.headers.get(header_to_get)


# def read_body_and_headers(url, post=None, headers=None, follow_redirects=False, timeout=None):
#     response = httptools.downloadpage(url, post=post, headers=headers, follow_redirects=follow_redirects,
#                                       timeout=timeout)
#     return response.data, response.headers


def printMatches(matches):
    i = 0
    for match in matches:
        logger.debug("%d %s" % (i, match))
        i = i + 1


def find_single_match(data, patron, index=0):
    try:
        if index == 0:
            matches = re.search(patron, data, flags=re.DOTALL)
            if matches:
                if len(matches.groups()) == 1:
                    return matches.group(1)
                elif len(matches.groups()) > 1:
                    return matches.groups()
                else:
                    return matches.group()
            else:
                return ""
        else:
            matches = re.findall(patron, data, flags=re.DOTALL)
            return matches[index]
    except:
        return ""


# Parse a string and extract multiple matches using regular expressions
def find_multiple_matches(text, pattern):
    return re.findall(pattern, text, re.DOTALL)


def find_multiple_matches_groups(text, pattern, flags):
    r = re.compile(pattern, flags)
    return [m.groupdict() for m in r.finditer(text)]


# Convert html entity codes such as "&ntilde;" into the corresponding "ñ" unicode utf-8 character
def decodeHtmlentities(data):
    entity_re = re.compile(r"&(#?)(\d{1,5}|\w{1,8})(;?)")

    def substitute_entity(match):
        ent = match.group(2) + match.group(3)
        res = ""
        while not ent in html5 and not ent.endswith(";") and match.group(1) != "#":
            # Exception for when '&' is used as an argument in the urls contained in the data
            try:
                res = ent[-1] + res
                ent = ent[:-1]
            except:
                break

        if match.group(1) == "#" and ent.replace(";", "").isdigit():
            ent = unichr(int(ent.replace(";", "")))
            return ent if PY3 else ent.encode('utf-8')
        else:
            cp = html5.get(ent)
            if cp:
                if PY3:
                    return cp + res
                else:
                    return cp.decode("unicode-escape").encode('utf-8') + res
            else:
                return match.group()

    return entity_re.subn(substitute_entity, data)[0]
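
# Illustrative usage of the matching helpers above (hypothetical data, not executed):
#
#     html = '<a href="https://example.com/a">A</a> <a href="https://example.com/b">B</a>'
#     find_single_match(html, r'href="([^"]+)"')      # -> 'https://example.com/a'
#     find_multiple_matches(html, r'href="([^"]+)"')  # -> ['https://example.com/a', 'https://example.com/b']
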
def unescape(text):
    """
    Removes HTML or XML character references and entities from a text string.
    keep &amp;, &gt;, &lt; in the source code.
    from Fredrik Lundh
    http://effbot.org/zone/re-sub.htm#unescape-html
    """
    if not ('&' in text and ';' in text): return text

    def fixup(m):
        text = m.group(0)
        ret = text
        if text[:2] == "&#":
            # character reference
            try:
                if text[:3] == "&#x":
                    ret = unichr(int(text[3:-1], 16))
                    return ret if PY3 else ret.encode("utf-8")
                else:
                    ret = unichr(int(text[2:-1]))
                    return ret if PY3 else ret.encode("utf-8")
            except ValueError:
                logger.error("value error")
                pass
        else:
            # named entity
            try:
                if PY3:
                    import html.entities as htmlentitydefs
                else:
                    import htmlentitydefs
                ret = unichr(htmlentitydefs.name2codepoint[text[1:-1]]).encode("utf-8")
            except KeyError:
                logger.error("keyerror")
                pass
            except:
                pass
        if type(ret) != str:
            ret = ret.decode()
        return ret  # leave as is

    return re.sub("&#?\w+;", fixup, str(text))


# Convert html codes such as "&ntilde;" into the "ñ" unicode utf-8 character
# def decodeHtmlentities(string):
#     string = entitiesfix(string)
#     entity_re = re.compile("&(#?)(\d{1,5}|\w{1,8});")
#
#     def substitute_entity(match):
#         if PY3:
#             from html.entities import name2codepoint as n2cp
#         else:
#             from htmlentitydefs import name2codepoint as n2cp
#         ent = match.group(2)
#         if match.group(1) == "#":
#             return unichr(int(ent)).encode('utf-8')
#         else:
#             cp = n2cp.get(ent)
#             if cp:
#                 return unichr(cp).encode('utf-8')
#             else:
#                 return match.group()
#
#     return entity_re.subn(substitute_entity, string)[0]


# def entitiesfix(string):
#     # Entities always start with the "&" symbol and end with a semicolon ( ; ).
#     string = string.replace("&aacute;", "á")
#     string = string.replace("&eacute;", "é")
#     string = string.replace("&iacute;", "í")
#     string = string.replace("&oacute;", "ó")
#     string = string.replace("&uacute;", "ú")
#     string = string.replace("&Aacute;", "Á")
#     string = string.replace("&Eacute;", "É")
#     string = string.replace("&Iacute;", "Í")
#     string = string.replace("&Oacute;", "Ó")
#     string = string.replace("&Uacute;", "Ú")
#     string = string.replace("&uuml;", "ü")
#     string = string.replace("&Uuml;", "Ü")
#     string = string.replace("&ntilde;", "ñ")
#     string = string.replace("&iquest;", "¿")
#     string = string.replace("&iexcl;", "¡")
#     string = string.replace(";;", ";")
#     return string
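
# Illustrative usage of the entity helpers above (hypothetical data, not executed;
# assumes core.entities.html5 carries the standard HTML5 named-entity table):
#
#     decodeHtmlentities("Espa&ntilde;a")            # -> 'España'
#     unescape("Caf&eacute; &#224; la carte")        # -> 'Café à la carte'
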
", "") cadena = cadena.replace("
", "") cadena = cadena.replace("", "") cadena = cadena.replace("", "") cadena = cadena.replace("", "") cadena = cadena.replace("", "") cadena = cadena.replace("", "") cadena = cadena.replace("", "") cadena = cadena.replace("
  • ", "") cadena = cadena.replace("
  • ", "") cadena = cadena.replace("", "") cadena = cadena.replace("", "") cadena = cadena.replace("", "") cadena = cadena.replace("", "") cadena = cadena.replace("", "") cadena = cadena.replace("
    ", " ") cadena = cadena.replace("
    ", " ") cadena = cadena.replace("
    ", " ") cadena = re.compile("]*>", re.DOTALL).sub(" ", cadena) cadena = re.compile("", re.DOTALL).sub("", cadena) cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("

    ", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("", re.DOTALL).sub("", cadena) cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("", "") cadena = re.compile("]*>", re.DOTALL).sub("", cadena) cadena = cadena.replace("\t", "") # cadena = entityunescape(cadena) return cadena def slugify(title): # print title # Substitutes accents and eñes title = title.replace("Á", "a") title = title.replace("É", "e") title = title.replace("Í", "i") title = title.replace("Ó", "o") title = title.replace("Ú", "u") title = title.replace("á", "a") title = title.replace("é", "e") title = title.replace("í", "i") title = title.replace("ó", "o") title = title.replace("ú", "u") title = title.replace("À", "a") title = title.replace("È", "e") title = title.replace("Ì", "i") title = title.replace("Ò", "o") title = title.replace("Ù", "u") title = title.replace("à", "a") title = title.replace("è", "e") title = title.replace("ì", "i") title = title.replace("ò", "o") title = title.replace("ù", "u") title = title.replace("ç", "c") title = title.replace("Ç", "C") title = title.replace("Ñ", "n") title = title.replace("ñ", "n") title = title.replace("/", "-") title = title.replace("&", "&") # Lowercase title = title.lower().strip() # Remove invalid characters validchars = "abcdefghijklmnopqrstuvwxyz1234567890- " title = ''.join(c for c in title if c in validchars) # Replace duplicate blanks and line breaks title = re.compile(r"\s+", re.DOTALL).sub(" ", title) # Replace blanks with hyphens title = re.compile(r"\s", re.DOTALL).sub("-", title.strip()) # Replace duplicate blanks and line breaks title = re.compile(r"\-+", re.DOTALL).sub("-", title) # Fix special cases if title.startswith("-"): title = title[1:] if title == "": title = "-" + str(time.time()) return title def remove_htmltags(string): return re.sub('<[^<]+?>', '', string) def remove_show_from_title(title, show): # print slugify(title)+" == "+slugify(show) # Remove program name from title if slugify(title).startswith(slugify(show)): # Convert to unicode first, or encoding is lost title = unicode(title, "utf-8", "replace") show = unicode(show, "utf-8", "replace") title = title[len(show):].strip() if title.startswith("-"): title = title[1:].strip() if title == "": title = str(time.time()) # Return to utf-8 title = title.encode("utf-8", "ignore") show = show.encode("utf-8", "ignore") return title def get_filename_from_url(url): if PY3: import urllib.parse as urlparse # It is very slow in PY2. 
def get_filename_from_url(url):
    if PY3:
        import urllib.parse as urlparse  # Native in PY3 (the backport is very slow in PY2)
    else:
        import urlparse  # Use the native PY2 module, which is faster
    parsed_url = urlparse.urlparse(url)
    try:
        filename = parsed_url.path
    except:
        # If it fails, this urlparse implementation does not expose attributes such as "path"
        if len(parsed_url) >= 4:
            filename = parsed_url[2]
        else:
            filename = ""

    if "/" in filename:
        filename = filename.split("/")[-1]

    return filename


def get_domain_from_url(url):
    if PY3:
        import urllib.parse as urlparse  # Native in PY3 (the backport is very slow in PY2)
    else:
        import urlparse  # Use the native PY2 module, which is faster
    parsed_url = urlparse.urlparse(url)
    try:
        filename = parsed_url.netloc
    except:
        # If it fails, this urlparse implementation does not expose attributes such as "netloc"
        if len(parsed_url) >= 4:
            filename = parsed_url[1]
        else:
            filename = ""

    return filename


def get_season_and_episode(title):
    """
    Returns the season and episode number in "1x01" format obtained from the title of an episode.
    Examples of different values for title and their return values:
        "serie 101x1.strm", "s101e1.avi", "t101e1.avi" -> '101x01'
        "Name TvShow 1x6.avi" -> '1x06'
        "Temp 3 episodio 2.avi" -> '3x02'
        "Alcantara season 13 episodie 12.avi" -> '13x12'
        "Temp1 capitulo 14" -> '1x14'
        "Temporada 1: El origen Episodio 9" -> '' (there cannot be any other text between the season number and the episode)
        "Episodio 25: titulo episodio" -> '' (the season number is missing)
        "Serie X Temporada 1" -> '' (the episode number is missing)

    @type title: str
    @param title: title of a series episode
    @rtype: str
    @return: season and episode number in "1x01" format, or an empty string if not found
    """
    filename = ""

    patrons = ["(?:[^\w]|^)(\d+)\s*[x-]\s*(\d+)(?:[^\w]|$)",
               "(\d+)\s*×\s*(\d+)",
               "(?:s|t)(\d+)[ .]?(?:e|Ep\.?)(\d+)",
               "(?:(?:stag|season|stagione\w*)\s*(\d+))?\s*(?:ep|epi|epis|episod[ioe]?|puntata)[ .-]*(\d+)"]

    for patron in patrons:
        try:
            matches = re.compile(patron, re.I).search(title)
            if matches:
                season = matches.group(1)
                if not season:
                    season = 1
                filename = str(int(season)) + "x" + str(int(matches.group(2))).zfill(2)
                break
        except:
            pass

    logger.debug("'" + title + "' -> '" + filename + "'")

    return filename


def get_sha1(cadena):
    try:
        import hashlib
        devuelve = hashlib.sha1(cadena).hexdigest()
    except:
        import sha
        import binascii
        devuelve = binascii.hexlify(sha.new(cadena).digest())

    return devuelve


def get_md5(cadena):
    try:
        import hashlib
        devuelve = hashlib.md5(cadena).hexdigest()
    except:
        import md5
        import binascii
        devuelve = binascii.hexlify(md5.new(cadena).digest())

    return devuelve


def title_unify(title):
    import unicodedata
    u_title = ''
    if type(title) == str:
        title = u'' + title
    for c in unicodedata.normalize('NFD', title):
        cat = unicodedata.category(c)
        if cat != 'Mn':
            if cat == 'Pd':
                c_new = '-'
            elif cat in ['Ll', 'Lu', 'Nd'] or c == ':':
                c_new = c
            else:
                c_new = ' '
            u_title += c_new
    if (u_title.count(':') + u_title.count('-')) == 1:
        # Subtitle: split, but only if there is exactly one separator, as it might be part of the title
        spl = u_title.replace(':', '-').split('-')
        u_title = spl[0] if len(spl[0]) > 5 else spl[1]

    return u_title.strip()
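
# Illustrative usage of the helpers above (hypothetical data, not executed):
#
#     get_season_and_episode("Show 2x08.mkv")   # -> '2x08'
#     title_unify("Akira - Il film")             # -> 'Akira'
#     title_unify("Città di carta")              # -> 'Citta di carta'
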
def girc(page_data, url, co, size='invisible'):
    """
    Code adapted from https://github.com/vb6rocod/utils/
    Copyright (C) 2019 vb6rocod
    and https://github.com/addon-lab/addon-lab_resolver_Project
    Copyright (C) 2021 ADDON-LAB, KAR10S
    """
    import re, random, string
    from core import httptools
    hdrs = {'Referer': url}
    rurl = 'https://www.google.com/recaptcha/api.js'
    aurl = 'https://www.google.com/recaptcha/api2'
    key = re.search(r"""(?:src="{0}\?.*?render|data-sitekey)=['"]?([^"']+)""".format(rurl), page_data)
    if key:
        key = key.group(1)
        # rurl = '{0}?render={1}'.format(rurl, key)
        page_data1 = httptools.downloadpage(rurl, headers=hdrs).data
        v = re.findall('releases/([^/]+)', page_data1)[0]
        rdata = {'ar': 1, 'k': key, 'co': co, 'hl': 'it', 'v': v, 'size': size, 'sa': 'submit',
                 'cb': ''.join([random.choice(string.ascii_lowercase + string.digits) for i in range(12)])}
        page_data2 = httptools.downloadpage('{0}/anchor?{1}'.format(aurl, httptools.urlparse.urlencode(rdata)),
                                            headers=hdrs).data
        rtoken = re.search('recaptcha-token.+?="([^"]+)', page_data2)
        if rtoken:
            rtoken = rtoken.group(1)
        else:
            return ''
        pdata = {'v': v, 'reason': 'q', 'k': key, 'c': rtoken, 'sa': '', 'co': co}
        hdrs.update({'Referer': aurl})
        page_data3 = httptools.downloadpage('{0}/reload?k={1}'.format(aurl, key), post=pdata, headers=hdrs).data
        gtoken = re.search('rresp","([^"]+)', page_data3)
        if gtoken:
            return gtoken.group(1)
    return ''
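
# Illustrative, hedged use of girc() (hypothetical values, not executed). The helper
# fetches a reCAPTCHA token for a protected page; 'co' is expected to be the
# base64-encoded origin of that page (scheme://host:port) as used by the anchor URL:
#
#     token = girc(page_data, url, co)   # -> token string, or '' if no sitekey/token was found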