This commit is contained in:
marco
2019-12-27 23:03:21 +01:00
parent e1c3900039
commit 0692132a3d
47 changed files with 218 additions and 3296 deletions
-112
View File
@@ -1,112 +0,0 @@
# -*- coding: utf-8 -*-
# --------------------------------------------------------------------------------
# Cloudflare decoder
# --------------------------------------------------------------------------------
import re
import time
import urllib
import urlparse
from platformcode import logger
class Cloudflare:
def __init__(self, response):
self.timeout = 5
self.domain = urlparse.urlparse(response["url"])[1]
self.protocol = urlparse.urlparse(response["url"])[0]
self.js_data = {}
self.header_data = {}
if not "var s,t,o,p,b,r,e,a,k,i,n,g,f" in response["data"] or "chk_jschl" in response["url"]:
return
try:
self.js_data["data"] = response["data"]
self.js_data["auth_url"] = \
re.compile('<form id="challenge-form" action="([^"]+)" method="get">').findall(response["data"])[0]
self.js_data["params"] = {}
self.js_data["params"]["jschl_vc"] = \
re.compile('<input type="hidden" name="jschl_vc" value="([^"]+)"/>').findall(response["data"])[0]
self.js_data["params"]["pass"] = \
re.compile('<input type="hidden" name="pass" value="([^"]+)"/>').findall(response["data"])[0]
self.js_data["wait"] = int(re.compile("\}, ([\d]+)\);", re.MULTILINE).findall(response["data"])[0]) / 1000
self.js_data["params"]["s"] = \
re.compile('<input type="hidden" name="s" value="([^"]+)"').findall(response["data"])[0]
except:
logger.debug("Metodo #1 (javascript): NO disponible")
self.js_data = {}
if "refresh" in response["headers"]:
try:
self.header_data["wait"] = int(response["headers"]["refresh"].split(";")[0])
self.header_data["auth_url"] = response["headers"]["refresh"].split("=")[1].split("?")[0]
self.header_data["params"] = {}
self.header_data["params"]["pass"] = response["headers"]["refresh"].split("=")[2]
except:
logger.debug("Metodo #2 (headers): NO disponible")
self.header_data = {}
def solve_cf(self, body, domain):
js = re.search(
r"setTimeout\(function\(\){\s+(var s,t,o,p,b,r,e,a,k,i,n,g,f.+?\r?\n[\s\S]+?a\.value =.+?)\r?\n",
body
).group(1)
js = re.sub(r"a\.value = ((.+).toFixed\(10\))?", r"\1", js)
js = re.sub(r'(e\s=\sfunction\(s\)\s{.*?};)', '', js, flags=re.DOTALL|re.MULTILINE)
js = re.sub(r"\s{3,}[a-z](?: = |\.).+", "", js).replace("t.length", str(len(domain)))
js = js.replace('; 121', '')
js = re.sub(r"[\n\\']", "", js)
jsEnv = """
var t = "{domain}";
var g = String.fromCharCode;
o = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=";
e = function(s) {{
s += "==".slice(2 - (s.length & 3));
var bm, r = "", r1, r2, i = 0;
for (; i < s.length;) {{
bm = o.indexOf(s.charAt(i++)) << 18 | o.indexOf(s.charAt(i++)) << 12 | (r1 = o.indexOf(s.charAt(i++))) << 6 | (r2 = o.indexOf(s.charAt(i++)));
r += r1 === 64 ? g(bm >> 16 & 255) : r2 === 64 ? g(bm >> 16 & 255, bm >> 8 & 255) : g(bm >> 16 & 255, bm >> 8 & 255, bm & 255);
}}
return r;
}};
function italics (str) {{ return '<i>' + this + '</i>'; }};
var document = {{
getElementById: function () {{
return {{'innerHTML': '{innerHTML}'}};
}}
}};
{js}
"""
innerHTML = re.search('<div(?: [^<>]*)? id="([^<>]*?)">([^<>]*?)<\/div>', body , re.MULTILINE | re.DOTALL)
innerHTML = innerHTML.group(2).replace("'", r"\'") if innerHTML else ""
import js2py
from jsc import jsunc
js = jsunc(jsEnv.format(domain=domain, innerHTML=innerHTML, js=js))
def atob(s):
return base64.b64decode('{}'.format(s)).decode('utf-8')
js2py.disable_pyimport()
context = js2py.EvalJs({'atob': atob})
result = context.eval(js)
return float(result)
@property
def wait_time(self):
if self.js_data.get("wait", 0):
return self.js_data["wait"]
else:
return self.header_data.get("wait", 0)
@property
def is_cloudflare(self):
return self.header_data.get("wait", 0) > 0 or self.js_data.get("wait", 0) > 0
def get_url(self):
# Metodo #1 (javascript)
if self.js_data.get("wait", 0):
self.js_data["params"]["jschl_answer"] = self.solve_cf(self.js_data["data"], self.domain)
response = "%s://%s%s?%s" % (
self.protocol, self.domain, self.js_data["auth_url"], urllib.urlencode(self.js_data["params"]))
time.sleep(self.js_data["wait"])
return response
+7 -7
View File
@@ -17,7 +17,7 @@ from threading import Lock
from core.jsontools import to_utf8
from platformcode import config, logger
from platformcode.logger import WebErrorException
from core import scrapertoolsV2
from core import scrapertools
# Get the addon version
__version = config.get_addon_version()
@@ -48,7 +48,7 @@ def get_user_agent():
def get_url_headers(url, forced=False):
domain = urlparse.urlparse(url)[1]
sub_dom = scrapertoolsV2.find_single_match(domain, r'\.(.*?\.\w+)')
sub_dom = scrapertools.find_single_match(domain, r'\.(.*?\.\w+)')
if sub_dom and not 'google' in url:
domain = sub_dom
domain_cookies = cj._cookies.get("." + domain, {}).get("/", {})
@@ -159,16 +159,16 @@ def channel_proxy_list(url, forced_proxy=None):
if not url.endswith('/'):
url += '/'
if scrapertoolsV2.find_single_match(url, r'(?:http.*\:)?\/\/(?:www\.)?([^\?|\/]+)(?:\?|\/)') \
if scrapertools.find_single_match(url, r'(?:http.*\:)?\/\/(?:www\.)?([^\?|\/]+)(?:\?|\/)') \
in proxy_channel_bloqued:
if forced_proxy and forced_proxy not in ['Total', 'ProxyDirect', 'ProxyCF', 'ProxyWeb']:
if forced_proxy in proxy_channel_bloqued[scrapertoolsV2.find_single_match(url, r'(?:http.*\:)?\/\/(?:www\.)?([^\?|\/]+)(?:\?|\/)')]:
if forced_proxy in proxy_channel_bloqued[scrapertools.find_single_match(url, r'(?:http.*\:)?\/\/(?:www\.)?([^\?|\/]+)(?:\?|\/)')]:
return True
else:
return False
if forced_proxy:
return True
if not 'OFF' in proxy_channel_bloqued[scrapertoolsV2.find_single_match(url, r'(?:http.*\:)?\/\/(?:www\.)?([^\?|\/]+)(?:\?|\/)')]:
if not 'OFF' in proxy_channel_bloqued[scrapertools.find_single_match(url, r'(?:http.*\:)?\/\/(?:www\.)?([^\?|\/]+)(?:\?|\/)')]:
return True
return False
@@ -248,7 +248,7 @@ def check_proxy(url, **opt):
proxy_data['log'] = proxytools.get_proxy_addr(url, post=opt.get('post', None), forced_proxy=forced_proxy)
if proxy_addr_forced and proxy_data['log']:
proxy_data['log'] = scrapertoolsV2.find_single_match(str(proxy_addr_forced), r"{'http.*':\s*'(.*?)'}")
proxy_data['log'] = scrapertools.find_single_match(str(proxy_addr_forced), r"{'http.*':\s*'(.*?)'}")
if proxy and proxy_data['addr']:
if proxy_addr_forced: proxy_data['addr'] = proxy_addr_forced
@@ -564,7 +564,7 @@ def downloadpage(url, **opt):
save_cookies(alfa_s=opt.get('alfa_s', False))
# is_channel = inspect.getmodule(inspect.currentframe().f_back)
# is_channel = scrapertoolsV2.find_single_match(str(is_channel), "<module '(channels).*?'")
# is_channel = scrapertools.find_single_match(str(is_channel), "<module '(channels).*?'")
# if is_channel and isinstance(response_code, int):
# if not opt.get('ignore_response_code', False) and not proxy_data.get('stat', ''):
# if response_code > 399:
+76 -82
View File
@@ -1,27 +1,17 @@
# -*- coding: utf-8 -*-
# --------------------------------------------------------------------------------
# Scraper tools for reading and processing web elements
# Scraper tools v2 for reading and processing web elements
# --------------------------------------------------------------------------------
import re
import time
from core import httptools
import urlparse
from core.entities import html5
from platformcode import logger
def get_header_from_response(url, header_to_get="", post=None, headers=None):
header_to_get = header_to_get.lower()
response = httptools.downloadpage(url, post=post, headers=headers, only_headers=True)
return response.headers.get(header_to_get)
def read_body_and_headers(url, post=None, headers=None, follow_redirects=False, timeout=None):
response = httptools.downloadpage(url, post=post, headers=headers, follow_redirects=follow_redirects,
timeout=timeout)
return response.data, response.headers
def printMatches(matches):
i = 0
for match in matches:
@@ -42,8 +32,37 @@ def find_multiple_matches(text, pattern):
return re.findall(pattern, text, re.DOTALL)
def entityunescape(cadena):
return unescape(cadena)
def find_multiple_matches_groups(text, pattern):
r = re.compile(pattern)
return [m.groupdict() for m in r.finditer(text)]
# Convierte los codigos html "&ntilde;" y lo reemplaza por "ñ" caracter unicode utf-8
def decodeHtmlentities(data):
entity_re = re.compile("&(#?)(\d{1,5}|\w{1,8})(;?)")
def substitute_entity(match):
ent = match.group(2) + match.group(3)
res = ""
while not ent in html5 and not ent.endswith(";") and match.group(1) != "#":
# Excepción para cuando '&' se usa como argumento en la urls contenidas en los datos
try:
res = ent[-1] + res
ent = ent[:-1]
except:
break
if match.group(1) == "#":
ent = unichr(int(ent.replace(";", "")))
return ent.encode('utf-8')
else:
cp = html5.get(ent)
if cp:
return cp.decode("unicode-escape").encode('utf-8') + res
else:
return match.group()
return entity_re.subn(substitute_entity, data)[0]
def unescape(text):
@@ -84,47 +103,6 @@ def unescape(text):
# Convierte los codigos html "&ntilde;" y lo reemplaza por "ñ" caracter unicode utf-8
def decodeHtmlentities(string):
string = entitiesfix(string)
entity_re = re.compile("&(#?)(\d{1,5}|\w{1,8});")
def substitute_entity(match):
from htmlentitydefs import name2codepoint as n2cp
ent = match.group(2)
if match.group(1) == "#":
return unichr(int(ent)).encode('utf-8')
else:
cp = n2cp.get(ent)
if cp:
return unichr(cp).encode('utf-8')
else:
return match.group()
return entity_re.subn(substitute_entity, string)[0]
def entitiesfix(string):
# Las entidades comienzan siempre con el símbolo & , y terminan con un punto y coma ( ; ).
string = string.replace("&aacute", "&aacute;")
string = string.replace("&eacute", "&eacute;")
string = string.replace("&iacute", "&iacute;")
string = string.replace("&oacute", "&oacute;")
string = string.replace("&uacute", "&uacute;")
string = string.replace("&Aacute", "&Aacute;")
string = string.replace("&Eacute", "&Eacute;")
string = string.replace("&Iacute", "&Iacute;")
string = string.replace("&Oacute", "&Oacute;")
string = string.replace("&Uacute", "&Uacute;")
string = string.replace("&uuml", "&uuml;")
string = string.replace("&Uuml", "&Uuml;")
string = string.replace("&ntilde", "&ntilde;")
string = string.replace("&#191", "&#191;")
string = string.replace("&#161", "&#161;")
string = string.replace(";;", ";")
return string
def htmlclean(cadena):
cadena = re.compile("<!--.*?-->", re.DOTALL).sub("", cadena)
@@ -226,7 +204,7 @@ def htmlclean(cadena):
cadena = re.compile("<link[^>]*>", re.DOTALL).sub("", cadena)
cadena = cadena.replace("\t", "")
cadena = entityunescape(cadena)
# cadena = entityunescape(cadena)
return cadena
@@ -314,8 +292,8 @@ def remove_show_from_title(title, show):
return title
# scrapertools.get_filename_from_url(media_url)[-4:]
def get_filename_from_url(url):
import urlparse
parsed_url = urlparse.urlparse(url)
try:
filename = parsed_url.path
@@ -332,19 +310,18 @@ def get_filename_from_url(url):
return filename
# def get_domain_from_url(url):
# import urlparse
# parsed_url = urlparse.urlparse(url)
# try:
# filename = parsed_url.netloc
# except:
# # Si falla es porque la implementación de parsed_url no reconoce los atributos como "path"
# if len(parsed_url) >= 4:
# filename = parsed_url[1]
# else:
# filename = ""
#
# return filename
def get_domain_from_url(url):
parsed_url = urlparse.urlparse(url)
try:
filename = parsed_url.netloc
except:
# Si falla es porque la implementación de parsed_url no reconoce los atributos como "path"
if len(parsed_url) >= 4:
filename = parsed_url[1]
else:
filename = ""
return filename
def get_season_and_episode(title):
@@ -365,22 +342,15 @@ def get_season_and_episode(title):
@return: Numero de temporada y episodio en formato "1x01" o cadena vacia si no se han encontrado
"""
filename = ""
# 4l3x87 - fix for series example 9-1-1
# original_title = title
# title = title.replace('9-1-1','')
patrons = ["(\d+)\s*[x-]\s*(\d+)", "(\d+)\s*×\s*(\d+)", "(?:s|t)(\d+)e(\d+)",
"(?:season|temp|stagione\w*)\s*(\d+)\s*(?:capitulo|epi|episode|episodio\w*)\s*(\d+)"]
patrons = ["(\d+)x(\d+)", "(?:s|t)(\d+)e(\d+)",
"(?:season|temp\w*)\s*(\d+)\s*(?:capitulo|epi\w*)\s*(\d+)"]
for patron in patrons:
try:
matches = re.compile(patron, re.I).search(title)
if matches:
if len(matches.group(1)) == 1:
filename = matches.group(1) + "x" + matches.group(2).zfill(2)
else:
filename = matches.group(1).lstrip('0') + "x" + matches.group(2).zfill(2)
filename = matches.group(1) + "x" + matches.group(2).zfill(2)
break
except:
pass
@@ -388,3 +358,27 @@ def get_season_and_episode(title):
logger.info("'" + title + "' -> '" + filename + "'")
return filename
def get_sha1(cadena):
try:
import hashlib
devuelve = hashlib.sha1(cadena).hexdigest()
except:
import sha
import binascii
devuelve = binascii.hexlify(sha.new(cadena).digest())
return devuelve
def get_md5(cadena):
try:
import hashlib
devuelve = hashlib.md5(cadena).hexdigest()
except:
import md5
import binascii
devuelve = binascii.hexlify(md5.new(cadena).digest())
return devuelve
-346
View File
@@ -1,346 +0,0 @@
# -*- coding: utf-8 -*-
# --------------------------------------------------------------------------------
# Scraper tools v2 for reading and processing web elements
# --------------------------------------------------------------------------------
import re
import time
import urlparse
from core.entities import html5
from platformcode import logger
def printMatches(matches):
i = 0
for match in matches:
logger.info("%d %s" % (i, match))
i = i + 1
def find_single_match(data, patron, index=0):
try:
matches = re.findall(patron, data, flags=re.DOTALL)
return matches[index]
except:
return ""
# Parse string and extracts multiple matches using regular expressions
def find_multiple_matches(text, pattern):
return re.findall(pattern, text, re.DOTALL)
def find_multiple_matches_groups(text, pattern):
r = re.compile(pattern)
return [m.groupdict() for m in r.finditer(text)]
# Convierte los codigos html "&ntilde;" y lo reemplaza por "ñ" caracter unicode utf-8
def decodeHtmlentities(data):
entity_re = re.compile("&(#?)(\d{1,5}|\w{1,8})(;?)")
def substitute_entity(match):
ent = match.group(2) + match.group(3)
res = ""
while not ent in html5 and not ent.endswith(";") and match.group(1) != "#":
# Excepción para cuando '&' se usa como argumento en la urls contenidas en los datos
try:
res = ent[-1] + res
ent = ent[:-1]
except:
break
if match.group(1) == "#":
ent = unichr(int(ent.replace(";", "")))
return ent.encode('utf-8')
else:
cp = html5.get(ent)
if cp:
return cp.decode("unicode-escape").encode('utf-8') + res
else:
return match.group()
return entity_re.subn(substitute_entity, data)[0]
def htmlclean(cadena):
cadena = re.compile("<!--.*?-->", re.DOTALL).sub("", cadena)
cadena = cadena.replace("<center>", "")
cadena = cadena.replace("</center>", "")
cadena = cadena.replace("<cite>", "")
cadena = cadena.replace("</cite>", "")
cadena = cadena.replace("<em>", "")
cadena = cadena.replace("</em>", "")
cadena = cadena.replace("<u>", "")
cadena = cadena.replace("</u>", "")
cadena = cadena.replace("<li>", "")
cadena = cadena.replace("</li>", "")
cadena = cadena.replace("<turl>", "")
cadena = cadena.replace("</tbody>", "")
cadena = cadena.replace("<tr>", "")
cadena = cadena.replace("</tr>", "")
cadena = cadena.replace("<![CDATA[", "")
cadena = cadena.replace("<wbr>", "")
cadena = cadena.replace("<Br />", " ")
cadena = cadena.replace("<BR />", " ")
cadena = cadena.replace("<Br>", " ")
cadena = re.compile("<br[^>]*>", re.DOTALL).sub(" ", cadena)
cadena = re.compile("<script.*?</script>", re.DOTALL).sub("", cadena)
cadena = re.compile("<option[^>]*>", re.DOTALL).sub("", cadena)
cadena = cadena.replace("</option>", "")
cadena = re.compile("<button[^>]*>", re.DOTALL).sub("", cadena)
cadena = cadena.replace("</button>", "")
cadena = re.compile("<i[^>]*>", re.DOTALL).sub("", cadena)
cadena = cadena.replace("</iframe>", "")
cadena = cadena.replace("</i>", "")
cadena = re.compile("<table[^>]*>", re.DOTALL).sub("", cadena)
cadena = cadena.replace("</table>", "")
cadena = re.compile("<td[^>]*>", re.DOTALL).sub("", cadena)
cadena = cadena.replace("</td>", "")
cadena = re.compile("<div[^>]*>", re.DOTALL).sub("", cadena)
cadena = cadena.replace("</div>", "")
cadena = re.compile("<dd[^>]*>", re.DOTALL).sub("", cadena)
cadena = cadena.replace("</dd>", "")
cadena = re.compile("<b[^>]*>", re.DOTALL).sub("", cadena)
cadena = cadena.replace("</b>", "")
cadena = re.compile("<font[^>]*>", re.DOTALL).sub("", cadena)
cadena = cadena.replace("</font>", "")
cadena = re.compile("<strong[^>]*>", re.DOTALL).sub("", cadena)
cadena = cadena.replace("</strong>", "")
cadena = re.compile("<small[^>]*>", re.DOTALL).sub("", cadena)
cadena = cadena.replace("</small>", "")
cadena = re.compile("<span[^>]*>", re.DOTALL).sub("", cadena)
cadena = cadena.replace("</span>", "")
cadena = re.compile("<a[^>]*>", re.DOTALL).sub("", cadena)
cadena = cadena.replace("</a>", "")
cadena = re.compile("<p[^>]*>", re.DOTALL).sub("", cadena)
cadena = cadena.replace("</p>", "")
cadena = re.compile("<ul[^>]*>", re.DOTALL).sub("", cadena)
cadena = cadena.replace("</ul>", "")
cadena = re.compile("<h1[^>]*>", re.DOTALL).sub("", cadena)
cadena = cadena.replace("</h1>", "")
cadena = re.compile("<h2[^>]*>", re.DOTALL).sub("", cadena)
cadena = cadena.replace("</h2>", "")
cadena = re.compile("<h3[^>]*>", re.DOTALL).sub("", cadena)
cadena = cadena.replace("</h3>", "")
cadena = re.compile("<h4[^>]*>", re.DOTALL).sub("", cadena)
cadena = cadena.replace("</h4>", "")
cadena = re.compile("<!--[^-]+-->", re.DOTALL).sub("", cadena)
cadena = re.compile("<img[^>]*>", re.DOTALL).sub("", cadena)
cadena = re.compile("<object[^>]*>", re.DOTALL).sub("", cadena)
cadena = cadena.replace("</object>", "")
cadena = re.compile("<param[^>]*>", re.DOTALL).sub("", cadena)
cadena = cadena.replace("</param>", "")
cadena = re.compile("<embed[^>]*>", re.DOTALL).sub("", cadena)
cadena = cadena.replace("</embed>", "")
cadena = re.compile("<title[^>]*>", re.DOTALL).sub("", cadena)
cadena = cadena.replace("</title>", "")
cadena = re.compile("<link[^>]*>", re.DOTALL).sub("", cadena)
cadena = cadena.replace("\t", "")
# cadena = entityunescape(cadena)
return cadena
def slugify(title):
# print title
# Sustituye acentos y eñes
title = title.replace("Á", "a")
title = title.replace("É", "e")
title = title.replace("Í", "i")
title = title.replace("Ó", "o")
title = title.replace("Ú", "u")
title = title.replace("á", "a")
title = title.replace("é", "e")
title = title.replace("í", "i")
title = title.replace("ó", "o")
title = title.replace("ú", "u")
title = title.replace("À", "a")
title = title.replace("È", "e")
title = title.replace("Ì", "i")
title = title.replace("Ò", "o")
title = title.replace("Ù", "u")
title = title.replace("à", "a")
title = title.replace("è", "e")
title = title.replace("ì", "i")
title = title.replace("ò", "o")
title = title.replace("ù", "u")
title = title.replace("ç", "c")
title = title.replace("Ç", "C")
title = title.replace("Ñ", "n")
title = title.replace("ñ", "n")
title = title.replace("/", "-")
title = title.replace("&amp;", "&")
# Pasa a minúsculas
title = title.lower().strip()
# Elimina caracteres no válidos
validchars = "abcdefghijklmnopqrstuvwxyz1234567890- "
title = ''.join(c for c in title if c in validchars)
# Sustituye espacios en blanco duplicados y saltos de línea
title = re.compile("\s+", re.DOTALL).sub(" ", title)
# Sustituye espacios en blanco por guiones
title = re.compile("\s", re.DOTALL).sub("-", title.strip())
# Sustituye espacios en blanco duplicados y saltos de línea
title = re.compile("\-+", re.DOTALL).sub("-", title)
# Arregla casos especiales
if title.startswith("-"):
title = title[1:]
if title == "":
title = "-" + str(time.time())
return title
def remove_htmltags(string):
return re.sub('<[^<]+?>', '', string)
def remove_show_from_title(title, show):
# print slugify(title)+" == "+slugify(show)
# Quita el nombre del programa del título
if slugify(title).startswith(slugify(show)):
# Convierte a unicode primero, o el encoding se pierde
title = unicode(title, "utf-8", "replace")
show = unicode(show, "utf-8", "replace")
title = title[len(show):].strip()
if title.startswith("-"):
title = title[1:].strip()
if title == "":
title = str(time.time())
# Vuelve a utf-8
title = title.encode("utf-8", "ignore")
show = show.encode("utf-8", "ignore")
return title
# scrapertools.get_filename_from_url(media_url)[-4:]
def get_filename_from_url(url):
parsed_url = urlparse.urlparse(url)
try:
filename = parsed_url.path
except:
# Si falla es porque la implementación de parsed_url no reconoce los atributos como "path"
if len(parsed_url) >= 4:
filename = parsed_url[2]
else:
filename = ""
if "/" in filename:
filename = filename.split("/")[-1]
return filename
def get_domain_from_url(url):
parsed_url = urlparse.urlparse(url)
try:
filename = parsed_url.netloc
except:
# Si falla es porque la implementación de parsed_url no reconoce los atributos como "path"
if len(parsed_url) >= 4:
filename = parsed_url[1]
else:
filename = ""
return filename
def get_season_and_episode(title):
"""
Retorna el numero de temporada y de episodio en formato "1x01" obtenido del titulo de un episodio
Ejemplos de diferentes valores para title y su valor devuelto:
"serie 101x1.strm", "s101e1.avi", "t101e1.avi" -> '101x01'
"Name TvShow 1x6.avi" -> '1x06'
"Temp 3 episodio 2.avi" -> '3x02'
"Alcantara season 13 episodie 12.avi" -> '13x12'
"Temp1 capitulo 14" -> '1x14'
"Temporada 1: El origen Episodio 9" -> '' (entre el numero de temporada y los episodios no puede haber otro texto)
"Episodio 25: titulo episodio" -> '' (no existe el numero de temporada)
"Serie X Temporada 1" -> '' (no existe el numero del episodio)
@type title: str
@param title: titulo del episodio de una serie
@rtype: str
@return: Numero de temporada y episodio en formato "1x01" o cadena vacia si no se han encontrado
"""
filename = ""
patrons = ["(\d+)x(\d+)", "(?:s|t)(\d+)e(\d+)",
"(?:season|temp\w*)\s*(\d+)\s*(?:capitulo|epi\w*)\s*(\d+)"]
for patron in patrons:
try:
matches = re.compile(patron, re.I).search(title)
if matches:
filename = matches.group(1) + "x" + matches.group(2).zfill(2)
break
except:
pass
logger.info("'" + title + "' -> '" + filename + "'")
return filename
def get_sha1(cadena):
try:
import hashlib
devuelve = hashlib.sha1(cadena).hexdigest()
except:
import sha
import binascii
devuelve = binascii.hexlify(sha.new(cadena).digest())
return devuelve
def get_md5(cadena):
try:
import hashlib
devuelve = hashlib.md5(cadena).hexdigest()
except:
import md5
import binascii
devuelve = binascii.hexlify(md5.new(cadena).digest())
return devuelve
+2 -2
View File
@@ -506,8 +506,8 @@ def get_server_json(server_name):
def get_server_host(server_name):
from core import scrapertoolsV2
return [scrapertoolsV2.get_domain_from_url(pattern['url']) for pattern in get_server_json(server_name)['find_videos']['patterns']]
from core import scrapertools
return [scrapertools.get_domain_from_url(pattern['url']) for pattern in get_server_json(server_name)['find_videos']['patterns']]
def get_server_controls_settings(server_name):
+31 -31
View File
@@ -10,7 +10,7 @@ import urlparse
import xbmcaddon
from channelselector import thumb
from core import httptools, scrapertoolsV2, servertools, tmdb, channeltools
from core import httptools, scrapertools, servertools, tmdb, channeltools
from core.item import Item
from lib import unshortenit
from platformcode import logger, config
@@ -21,7 +21,7 @@ def hdpass_get_servers(item):
itemlist = []
data = httptools.downloadpage(item.url).data.replace('\n', '')
patron = r'<iframe(?: id="[^"]+")? width="[^"]+" height="[^"]+" src="([^"]+)"[^>]+><\/iframe>'
url = scrapertoolsV2.find_single_match(data, patron).replace("?alta", "")
url = scrapertools.find_single_match(data, patron).replace("?alta", "")
url = url.replace("&download=1", "")
if 'https' not in url:
url = 'https:' + url
@@ -37,20 +37,20 @@ def hdpass_get_servers(item):
patron_mir = '<div class="row mobileMirrs">(.*?)</div>'
patron_media = r'<input type="hidden" name="urlEmbed" data-mirror="([^"]+)" id="urlEmbed"\s*value="([^"]+)"\s*/>'
res = scrapertoolsV2.find_single_match(data, patron_res)
res = scrapertools.find_single_match(data, patron_res)
itemlist = []
for res_url, res_video in scrapertoolsV2.find_multiple_matches(res, '<option.*?value="([^"]+?)">([^<]+?)</option>'):
for res_url, res_video in scrapertools.find_multiple_matches(res, '<option.*?value="([^"]+?)">([^<]+?)</option>'):
data = httptools.downloadpage(urlparse.urljoin(url, res_url)).data.replace('\n', '')
mir = scrapertoolsV2.find_single_match(data, patron_mir)
mir = scrapertools.find_single_match(data, patron_mir)
for mir_url, srv in scrapertoolsV2.find_multiple_matches(mir, '<option.*?value="([^"]+?)">([^<]+?)</value>'):
for mir_url, srv in scrapertools.find_multiple_matches(mir, '<option.*?value="([^"]+?)">([^<]+?)</value>'):
data = httptools.downloadpage(urlparse.urljoin(url, mir_url)).data.replace('\n', '')
for media_label, media_url in scrapertoolsV2.find_multiple_matches(data, patron_media):
for media_label, media_url in scrapertools.find_multiple_matches(data, patron_media):
itemlist.append(Item(channel=item.channel,
action="play",
fulltitle=item.fulltitle,
@@ -168,13 +168,13 @@ def scrapeLang(scraped, lang, longtitle):
return language, longtitle
def cleantitle(title):
cleantitle = scrapertoolsV2.htmlclean(scrapertoolsV2.decodeHtmlentities(title).replace('"', "'").replace('×', 'x').replace('', '-')).strip()
cleantitle = scrapertools.htmlclean(scrapertools.decodeHtmlentities(title).replace('"', "'").replace('×', 'x').replace('', '-')).strip()
return cleantitle
def scrapeBlock(item, args, block, patron, headers, action, pagination, debug, typeContentDict, typeActionDict, blacklist, search, pag, function, lang):
itemlist = []
log("scrapeBlock qui", block, patron)
matches = scrapertoolsV2.find_multiple_matches_groups(block, patron)
matches = scrapertools.find_multiple_matches_groups(block, patron)
log('MATCHES =', matches)
if debug:
@@ -214,7 +214,7 @@ def scrapeBlock(item, args, block, patron, headers, action, pagination, debug, t
for kk in known_keys:
val = match[listGroups.index(kk)] if kk in listGroups else ''
if val and (kk == "url" or kk == 'thumb') and 'http' not in val:
val = scrapertoolsV2.find_single_match(item.url, 'https?://[a-z0-9.-]+') + val
val = scrapertools.find_single_match(item.url, 'https?://[a-z0-9.-]+') + val
scraped[kk] = val
if scraped['season']:
@@ -227,7 +227,7 @@ def scrapeBlock(item, args, block, patron, headers, action, pagination, debug, t
episode = ''
else:
episode = re.sub(r'\s-\s|-|x|&#8211|&#215;|×', 'x', scraped['episode']) if scraped['episode'] else ''
second_episode = scrapertoolsV2.find_single_match(episode,'x\d+x(\d+)')
second_episode = scrapertools.find_single_match(episode, 'x\d+x(\d+)')
if second_episode: episode = re.sub(r'(\d+x\d+)x\d+',r'\1-', episode) + second_episode.zfill(2)
#episode = re.sub(r'\s-\s|-|x|&#8211|&#215;', 'x', scraped['episode']) if scraped['episode'] else ''
@@ -257,18 +257,18 @@ def scrapeBlock(item, args, block, patron, headers, action, pagination, debug, t
if scraped["plot"]:
infolabels['plot'] = plot
if scraped['duration']:
matches = scrapertoolsV2.find_multiple_matches(scraped['duration'],
matches = scrapertools.find_multiple_matches(scraped['duration'],
r'([0-9])\s*?(?:[hH]|:|\.|,|\\|\/|\||\s)\s*?([0-9]+)')
for h, m in matches:
scraped['duration'] = int(h) * 60 + int(m)
if not matches:
scraped['duration'] = scrapertoolsV2.find_single_match(scraped['duration'], r'(\d+)')
scraped['duration'] = scrapertools.find_single_match(scraped['duration'], r'(\d+)')
infolabels['duration'] = int(scraped['duration']) * 60
if scraped['genere']:
genres = scrapertoolsV2.find_multiple_matches(scraped['genere'], '[A-Za-z]+')
genres = scrapertools.find_multiple_matches(scraped['genere'], '[A-Za-z]+')
infolabels['genere'] = ", ".join(genres)
if scraped["rating"]:
infolabels['rating'] = scrapertoolsV2.decodeHtmlentities(scraped["rating"])
infolabels['rating'] = scrapertools.decodeHtmlentities(scraped["rating"])
AC = CT = ''
if typeContentDict:
@@ -379,11 +379,11 @@ def scrape(func):
if not data:
page = httptools.downloadpage(item.url, headers=headers, ignore_response_code=True, session=item.session)
# if url may be changed and channel has findhost to update
if (not page.data or scrapertoolsV2.get_domain_from_url(page.url) != scrapertoolsV2.get_domain_from_url(item.url)) and 'findhost' in func.__globals__:
if (not page.data or scrapertools.get_domain_from_url(page.url) != scrapertools.get_domain_from_url(item.url)) and 'findhost' in func.__globals__:
host = func.__globals__['findhost']()
from core import jsontools
jsontools.update_node(host, func.__module__.split('.')[-1], 'url')
item.url = item.url.replace(scrapertoolsV2.get_domain_from_url(item.url), scrapertoolsV2.get_domain_from_url(host))
item.url = item.url.replace(scrapertools.get_domain_from_url(item.url), scrapertools.get_domain_from_url(host))
page = httptools.downloadpage(item.url, headers=headers, ignore_response_code=True,
session=item.session)
data = page.data.replace("'", '"')
@@ -394,7 +394,7 @@ def scrape(func):
if patronBlock:
if debugBlock:
regexDbg(item, patronBlock, headers, data)
blocks = scrapertoolsV2.find_multiple_matches_groups(data, patronBlock)
blocks = scrapertools.find_multiple_matches_groups(data, patronBlock)
block = ""
for bl in blocks:
# log(len(blocks),bl)
@@ -443,7 +443,7 @@ def scrape(func):
if anime:
if function == 'episodios' or item.action == 'episodios': autorenumber.renumber(itemlist, item, 'bold')
else: autorenumber.renumber(itemlist)
if anime and autorenumber.check(item) == False and not scrapertoolsV2.find_single_match(itemlist[0].title, r'(\d+.\d+)'):
if anime and autorenumber.check(item) == False and not scrapertools.find_single_match(itemlist[0].title, r'(\d+.\d+)'):
pass
else:
if addVideolibrary and (item.infoLabels["title"] or item.fulltitle):
@@ -471,7 +471,7 @@ def dooplay_get_links(item, host):
data = httptools.downloadpage(item.url).data.replace("'", '"')
patron = r'<li id="player-option-[0-9]".*?data-type="([^"]+)" data-post="([^"]+)" data-nume="([^"]+)".*?<span class="title".*?>([^<>]+)</span>(?:<span class="server">([^<>]+))?'
matches = scrapertoolsV2.find_multiple_matches(data, patron)
matches = scrapertools.find_multiple_matches(data, patron)
ret = []
@@ -483,7 +483,7 @@ def dooplay_get_links(item, host):
"type": type
})
dataAdmin = httptools.downloadpage(host + '/wp-admin/admin-ajax.php', post=postData,headers={'Referer': item.url}).data
link = scrapertoolsV2.find_single_match(dataAdmin, "<iframe.*src='([^']+)'")
link = scrapertools.find_single_match(dataAdmin, "<iframe.*src='([^']+)'")
ret.append({
'url': link,
'title': title,
@@ -560,25 +560,25 @@ def swzz_get_url(item):
if "/link/" in item.url:
data = httptools.downloadpage(item.url, headers=headers).data
if "link =" in data:
data = scrapertoolsV2.find_single_match(data, 'link = "([^"]+)"')
data = scrapertools.find_single_match(data, 'link = "([^"]+)"')
if 'http' not in data:
data = 'https:' + data
else:
match = scrapertoolsV2.find_single_match(data, r'<meta name="og:url" content="([^"]+)"')
match = scrapertoolsV2.find_single_match(data, r'URL=([^"]+)">') if not match else match
match = scrapertools.find_single_match(data, r'<meta name="og:url" content="([^"]+)"')
match = scrapertools.find_single_match(data, r'URL=([^"]+)">') if not match else match
if not match:
from lib import jsunpack
try:
data = scrapertoolsV2.find_single_match(data.replace('\n', ''), r"(eval\s?\(function\(p,a,c,k,e,d.*?)</script>")
data = scrapertools.find_single_match(data.replace('\n', ''), r"(eval\s?\(function\(p,a,c,k,e,d.*?)</script>")
data = jsunpack.unpack(data)
logger.debug("##### play /link/ unpack ##\n%s\n##" % data)
except:
logger.debug("##### The content is yet unpacked ##\n%s\n##" % data)
data = scrapertoolsV2.find_single_match(data, r'var link(?:\s)?=(?:\s)?"([^"]+)";')
data = scrapertools.find_single_match(data, r'var link(?:\s)?=(?:\s)?"([^"]+)";')
data, c = unshortenit.unwrap_30x_only(data)
else:
data = match
@@ -753,7 +753,7 @@ def typo(string, typography=''):
if 'submenu' in string:
string = u"\u2022\u2022 ".encode('utf-8') + re.sub(r'\ssubmenu','',string)
if 'color' in string:
color = scrapertoolsV2.find_single_match(string,'color ([a-z]+)')
color = scrapertools.find_single_match(string, 'color ([a-z]+)')
if color == 'kod' or '': color = kod_color
string = '[COLOR '+ color +']' + re.sub(r'\scolor\s([a-z]+)','',string) + '[/COLOR]'
if 'bold' in string:
@@ -785,13 +785,13 @@ def match(item, patron='', patronBlock='', headers='', url='', post=''):
log('DATA= ', data)
if patronBlock:
block = scrapertoolsV2.find_single_match(data, patronBlock)
block = scrapertools.find_single_match(data, patronBlock)
log('BLOCK= ',block)
else:
block = data
if patron:
matches = scrapertoolsV2.find_multiple_matches(block, patron)
matches = scrapertools.find_multiple_matches(block, patron)
log('MATCHES= ',matches)
return matches, block
@@ -899,12 +899,12 @@ def nextPage(itemlist, item, data='', patron='', function_or_level=1, next_page=
# If the call is direct, leave it blank
action = inspect.stack()[function_or_level][3] if type(function_or_level) == int else function_or_level
if next_page == '':
next_page = scrapertoolsV2.find_single_match(data, patron)
next_page = scrapertools.find_single_match(data, patron)
if next_page != "":
if resub: next_page = re.sub(resub[0], resub[1], next_page)
if 'http' not in next_page:
next_page = scrapertoolsV2.find_single_match(item.url, 'https?://[a-z0-9.-]+') + next_page
next_page = scrapertools.find_single_match(item.url, 'https?://[a-z0-9.-]+') + next_page
next_page = re.sub('&amp;', '&',next_page)
log('NEXT= ', next_page)
itemlist.append(