# -*- coding: utf-8 -*-
"""Tiny interpreter for a restricted subset of JavaScript.

It understands just enough syntax (assignments, indexing, a few string/array
methods, binary operators and calls to simple functions or object members)
to evaluate short functions extracted from a larger script with regular
expressions.  It is NOT a general purpose JavaScript engine.
"""

import json
import operator
import re


# Binary operators.  The order matters: interpret_expression() splits an
# expression on the first operator of this list that it can find.
_OPERATORS = [
    ('|', operator.or_),
    ('^', operator.xor),
    ('&', operator.and_),
    ('>>', operator.rshift),
    ('<<', operator.lshift),
    ('-', operator.sub),
    ('+', operator.add),
    ('%', operator.mod),
    ('/', operator.truediv),
    ('*', operator.mul),
]

# Compound assignments ("+=", "|=", ...) followed by the plain "=" which simply
# discards the current value.  Plain "=" must stay last so that "a+=1" is not
# parsed as an assignment to "a+".
_ASSIGN_OPERATORS = [(_op + '=', _func) for _op, _func in _OPERATORS]
_ASSIGN_OPERATORS.append(('=', lambda cur, right: right))

# A JavaScript identifier.
_NAME_RE = r'[a-zA-Z_$][a-zA-Z_$0-9]*'


class JSInterpreterError(ValueError):
    """Raised when the script cannot be parsed or evaluated."""


def _require(condition, message):
    """Raise JSInterpreterError(message) unless *condition* is true.

    Used instead of ``assert`` so the checks survive ``python -O``.
    """
    if not condition:
        raise JSInterpreterError(message)


class JSInterpreter(object):
    """Evaluate simple functions found inside a JavaScript source string."""

    def __init__(self, code, objects=None):
        """Store the script and prepare the function/object caches.

        :param code: full JavaScript source to search definitions in.
        :param objects: optional dict of already extracted objects
            (name -> dict of callables); it is used as the object cache.
        """
        self.code = code
        self._functions = {}
        self._objects = {} if objects is None else objects

    def interpret_statement(self, stmt, local_vars, allow_recursion=100):
        """Evaluate one statement (text between two ';').

        :param stmt: statement source.
        :param local_vars: dict of local variables, updated in place.
        :param allow_recursion: remaining evaluation depth budget.
        :return: ``(value, should_abort)``; ``should_abort`` is True for a
            ``return`` statement, telling the caller to stop executing.
        """
        should_abort = False
        stmt = stmt.lstrip()

        var_m = re.match(r'var\s', stmt)
        if var_m:
            # "var x = ..." is handled as the assignment "x = ...".
            expr = stmt[len(var_m.group(0)):]
        else:
            return_m = re.match(r'return(?:\s+|$)', stmt)
            if return_m:
                expr = stmt[len(return_m.group(0)):]
                should_abort = True
            else:
                # Anything else is tried as a bare expression.
                expr = stmt

        value = self.interpret_expression(expr, local_vars, allow_recursion)
        return value, should_abort

    def interpret_expression(self, expr, local_vars, allow_recursion):
        """Evaluate a single expression and return its value.

        Returns ``None`` for an empty expression and for any expression that
        none of the supported patterns recognises.

        :raises JSInterpreterError: if the recursion budget is exhausted or a
            supported construct is used with unexpected arguments.
        """
        if allow_recursion < 0:
            raise JSInterpreterError('Recursion limit reached')

        expr = expr.strip()
        if expr == '':
            return None

        if expr.startswith('('):
            # Evaluate the leading parenthesised group, then splice its value
            # (JSON encoded) in front of whatever follows it.
            depth = 0
            for m in re.finditer(r'[()]', expr):
                if m.group(0) == '(':
                    depth += 1
                    continue
                depth -= 1
                if depth == 0:
                    sub_result = self.interpret_expression(
                        expr[1:m.start()], local_vars, allow_recursion)
                    remaining = expr[m.end():].strip()
                    if not remaining:
                        return sub_result
                    expr = json.dumps(sub_result) + remaining
                    break

        # Assignments: "name = expr", "name[index] op= expr", ...
        for op, opfunc in _ASSIGN_OPERATORS:
            m = re.match(r'''(?x)
                (?P<out>%s)(?:\[(?P<index>[^\]]+?)\])?
                \s*%s
                (?P<expr>.*)$''' % (_NAME_RE, re.escape(op)), expr)
            if not m:
                continue
            right_val = self.interpret_expression(
                m.group('expr'), local_vars, allow_recursion - 1)

            if m.groupdict().get('index'):
                target = local_vars[m.group('out')]
                idx = self.interpret_expression(
                    m.group('index'), local_vars, allow_recursion)
                _require(isinstance(idx, int),
                         'Index must be an integer: %r' % (idx,))
                val = opfunc(target[idx], right_val)
                target[idx] = val
                return val

            val = opfunc(local_vars.get(m.group('out')), right_val)
            local_vars[m.group('out')] = val
            return val

        # Integer literal.
        if expr.isdigit():
            return int(expr)

        # Plain variable reference (keywords/literals are excluded).
        var_m = re.match(
            r'(?!if|return|true|false)(?P<name>%s)$' % _NAME_RE, expr)
        if var_m:
            return local_vars[var_m.group('name')]

        # Any other literal that happens to be valid JSON (strings, ...).
        try:
            return json.loads(expr)
        except ValueError:
            pass

        # Member access or method call: "var.member" / "var.member(args)".
        m = re.match(
            r'(?P<var>%s)\.(?P<member>[^(]+)(?:\(+(?P<args>[^()]*)\))?$'
            % _NAME_RE, expr)
        if m:
            variable = m.group('var')
            member = m.group('member')
            arg_str = m.group('args')

            if variable in local_vars:
                obj = local_vars[variable]
            else:
                # Not a local: it must be a global object defined in the
                # script; extract it once and cache it.
                if variable not in self._objects:
                    self._objects[variable] = self.extract_object(variable)
                obj = self._objects[variable]

            if arg_str is None:
                # Member access without a call.
                if member == 'length':
                    return len(obj)
                return obj[member]

            _require(expr.endswith(')'), 'Malformed call: %r' % expr)
            if arg_str == '':
                argvals = tuple()
            else:
                # Must be a tuple: the 'split' check below compares against
                # a tuple, and a list would never compare equal to it.
                argvals = tuple(
                    self.interpret_expression(v, local_vars, allow_recursion)
                    for v in arg_str.split(','))

            if member == 'split':
                _require(argvals == ('',),
                         'Only split("") is supported, got %r' % (argvals,))
                return list(obj)
            if member == 'join':
                _require(len(argvals) == 1, 'join() takes one argument')
                return argvals[0].join(obj)
            if member == 'reverse':
                _require(len(argvals) == 0, 'reverse() takes no argument')
                obj.reverse()
                return obj
            if member == 'slice':
                _require(len(argvals) == 1, 'slice() takes one argument')
                return obj[argvals[0]:]
            if member == 'splice':
                _require(isinstance(obj, list), 'splice() needs an array')
                index, how_many = argvals
                # Popping at the same index removes consecutive elements.
                return [obj.pop(index)
                        for _ in range(index, min(index + how_many, len(obj)))]

            # User defined method of an extracted object.
            return obj[member](argvals)

        # Indexing: "name[expr]".
        m = re.match(r'(?P<in>%s)\[(?P<idx>.+)\]$' % _NAME_RE, expr)
        if m:
            container = local_vars[m.group('in')]
            idx = self.interpret_expression(
                m.group('idx'), local_vars, allow_recursion - 1)
            return container[idx]

        # Binary operation: split on the first operator found.
        for op, opfunc in _OPERATORS:
            m = re.match(r'(?P<x>.+?)%s(?P<y>.+)' % re.escape(op), expr)
            if not m:
                continue
            left, _ = self.interpret_statement(
                m.group('x'), local_vars, allow_recursion - 1)
            right, _ = self.interpret_statement(
                m.group('y'), local_vars, allow_recursion - 1)
            return opfunc(left, right)

        # Call of a global function: "name(arg1,arg2)" where every argument
        # is an integer literal or a local variable name.
        m = re.match(
            r'^(?P<func>%s)\((?P<args>[a-zA-Z0-9_$,]+)\)$' % _NAME_RE, expr)
        if m:
            fname = m.group('func')
            # Pass the values themselves; wrapping each one in a list would
            # bind the callee's parameters to one-element lists.
            argvals = tuple(
                int(v) if v.isdigit() else local_vars[v]
                for v in m.group('args').split(','))
            if fname not in self._functions:
                self._functions[fname] = self.extract_function(fname)
            return self._functions[fname](argvals)

        # Unsupported expression: kept as a silent None for compatibility.
        return None

    def extract_object(self, objname):
        """Find ``objname = {key: function(...){...}, ...};`` in the script.

        Only function-valued fields are supported.

        :return: dict mapping each field name to a Python callable.
        :raises JSInterpreterError: if the object definition is not found.
        """
        obj_m = re.search(
            (r'(?:var\s+)?%s\s*=\s*\{' % re.escape(objname)) +
            r'\s*(?P<fields>([a-zA-Z$0-9]+\s*:\s*function\(.*?\)\s*\{.*?\}'
            r'(?:,\s*)?)*)' +
            r'\}\s*;',
            self.code)
        if obj_m is None:
            raise JSInterpreterError(
                'Could not find JS object %r' % objname)

        obj = {}
        fields_m = re.finditer(
            r'(?P<key>[a-zA-Z$0-9]+)\s*:\s*function'
            r'\((?P<args>[a-z,]+)\)\{(?P<code>[^}]+)\}',
            obj_m.group('fields'))
        for field in fields_m:
            argnames = field.group('args').split(',')
            obj[field.group('key')] = self.build_function(
                argnames, field.group('code'))
        return obj

    def extract_function(self, funcname):
        """Find the definition of *funcname* and compile it.

        Recognised forms: ``function name(...){...}``,
        ``name = function(...){...}`` (preceded by ``{``, ``;`` or ``,``) and
        ``var name = function(...){...}``.  The body must not contain ``}``.

        :return: callable taking ONE sequence with the positional arguments.
        :raises JSInterpreterError: if the function definition is not found.
        """
        escaped = re.escape(funcname)
        func_m = re.search(
            r'''(?x)
                (?:function\s+%s|[{;,]\s*%s\s*=\s*function|var\s+%s\s*=\s*function)\s*
                \((?P<args>[^)]*)\)\s*
                \{(?P<code>[^}]+)\}''' % (escaped, escaped, escaped),
            self.code)
        if func_m is None:
            raise JSInterpreterError(
                'Could not find JS function %r' % funcname)

        argnames = func_m.group('args').split(',')
        return self.build_function(argnames, func_m.group('code'))

    def call_function(self, funcname, *args):
        """Extract (once) and call *funcname* with positional *args*."""
        if funcname not in self._functions:
            self._functions[funcname] = self.extract_function(funcname)
        return self._functions[funcname](args)

    def build_function(self, argnames, code):
        """Return a callable running *code* with *argnames* bound to its args.

        The callable receives a single sequence of argument values, executes
        the ';'-separated statements in order and returns the value of the
        ``return`` statement (or of the last statement if there is none).
        """
        def resf(args):
            local_vars = dict(zip(argnames, args))
            res = None
            for stmt in code.split(';'):
                res, abort = self.interpret_statement(stmt, local_vars)
                if abort:
                    break
            return res
        return resf
En PY3 es nativo import urllib.parse as urlparse +else: + import urllib # Usamos el nativo de PY2 que es más rápido + import urlparse + +import re from core import httptools from core import jsontools as json @@ -14,6 +21,72 @@ from core import scrapertools from platformcode import config, logger +itag_list = {1: "video", + 5: "flv 240p", + 6: "flv 270p", + 17: "3gp 144p", + 18: "mp4 360p", + 22: "mp4 720p", + 34: "flv 360p", + 35: "flv 480p", + 36: "3gp 180p", + 37: "mp4 1080p", + 38: "mp4 3072p", + 43: "webm 360p", + 44: "webm 480p", + 45: "webm 720p", + 46: "webm 1080p", + 82: "mp4 360p 3D", + 83: "mp4 480p 3D", + 84: "mp4 720p 3D", + 85: "mp4 1080p 3D", + 92: "hls 240p 3D", + 93: "hls 360p 3D", + 94: "hls 480p 3D", + 95: "hls 720p 3D", + 96: "hls 1080p", + 100: "webm 360p 3D", + 101: "webm 480p 3D", + 102: "webm 720p 3D", + 132: "hls 240p", + 133: "mp4 240p", + 134: "mp4 360p", + 135: "mp4 480p", + 136: "mp4 720p", + 137: "mp4 1080p", + 138: "mp4 2160p", + 160: "mp4 144p", + 167: "webm 360p", + 168: "webm 480p", + 169: "webm 1080p", + 219: "webm 144p", + 242: "webm 240p", + 243: "webm 360p", + 244: "webm 480p", + 245: "webm 480p", + 246: "webm 480p", + 247: "webm 720p", + 248: "webm 1080p", + 266: "mp4 2160p", + 271: "webm 1440p", + 272: "webm 4320p", + 278: "webm 144p", + 298: "mp4 720p", + 299: "mp4 1080p", + 302: "webm 720p", + 303: "webm 1080p", + 308: "webm 1440p", + 313: "webm 2160p", + 315: "webm 2160p", + 330: "webm 144p hdr", + 331: "webm 240p hdr", + 332: "webm 360p hdr", + 333: "webm 480p hdr", + 334: "webm 720p hdr", + 335: "webm 1080p hdr", + 336: "webm 1440p hdr"} + + def test_video_exists(page_url): logger.info("(page_url='%s')" % page_url) @@ -21,11 +94,9 @@ def test_video_exists(page_url): if "File was deleted" in data: return False, config.get_localized_string(70449) % "Youtube" - return True, "" - def get_video_url(page_url, premium=False, user="", password="", video_password=""): logger.info("(page_url='%s')" % page_url) @@ -35,10 +106,6 @@ 
def get_video_url(page_url, premium=False, user="", password="", video_password= video_id = scrapertools.find_single_match(page_url, '(?:v=|embed/)([A-z0-9_-]{11})') video_urls = extract_videos(video_id) - video_urls.reverse() - - for video_url in video_urls: - logger.info(str(video_url)) return video_urls @@ -86,48 +153,37 @@ def extract_flashvars(data): return flashvars +def get_signature(youtube_page_data): + + from lib.jsinterpreter import JSInterpreter + + urljs = scrapertools.find_single_match(youtube_page_data, '"assets":.*?"js":\s*"([^"]+)"') + urljs = urljs.replace("\\", "") + if urljs: + if not re.search(r'https?://', urljs): + urljs = urlparse.urljoin("https://www.youtube.com", urljs) + data_js = httptools.downloadpage(urljs).data + + pattern = r'(?P\w+)=function\(\w+\){(\w)=\2\.split\(""\);.*?return\s+\2\.join\(""\)}' + + funcname = re.search(pattern, data_js).group('fname') + + jsi = JSInterpreter(data_js) + js_signature = jsi.extract_function(funcname) + + return js_signature + + def extract_videos(video_id): - fmt_value = { - 5: "240p h263 flv", - 6: "270p h263 flv", - 18: "360p h264 mp4", - 22: "720p h264 mp4", - 26: "???", - 33: "???", - 34: "360p h264 flv", - 35: "480p h264 flv", - 36: "3gpp", - 37: "1080p h264 mp4", - 38: "4K h264 mp4", - 43: "360p vp8 webm", - 44: "480p vp8 webm", - 45: "720p vp8 webm", - 46: "1080p vp8 webm", - 59: "480p h264 mp4", - 78: "480p h264 mp4", - 82: "360p h264 3D", - 83: "480p h264 3D", - 84: "720p h264 3D", - 85: "1080p h264 3D", - 100: "360p vp8 3D", - 101: "480p vp8 3D", - 102: "720p vp8 3D", - 91:"144 h264 mp4", - 92:"240 h264 mp4", - 93:"360 h264 mp4", - 94:"480 h264 mp4", - 95:"720 h264 mp4", - 96:"1080 h264 mp4", - 132:"240 h264 mp4", - 151:"72 h264 mp4" - } - # from core.support import dbg; dbg() + + url = 'https://www.youtube.com/get_video_info?video_id=%s&eurl=https://youtube.googleapis.com/v/%s&ssl_stream=1' % \ (video_id, video_id) data = httptools.downloadpage(url).data video_urls = [] params = 
dict(urlparse.parse_qsl(data)) + if params.get('hlsvp'): video_urls.append(["(LIVE .m3u8) [youtube]", params['hlsvp']]) return video_urls @@ -140,62 +196,27 @@ def extract_videos(video_id): if params.get('use_cipher_signature', '') != 'True': video_urls.append(['mpd HD [youtube]', params['dashmpd'], 0, '', True]) - js_signature = "" - youtube_page_data = httptools.downloadpage("http://www.youtube.com/watch?v=%s" % video_id).data + youtube_page_data = httptools.downloadpage("https://www.youtube.com/watch?v=%s" % video_id).data + params = extract_flashvars(youtube_page_data) - data_flashvars =[] - if params.get('adaptive_fmts'): - data_flashvars += scrapertools.find_multiple_matches(params['adaptive_fmts'], '(fps.*?url[^,]+)') - if params.get('url_encoded_fmt_stream_map'): - data_flashvars += params["url_encoded_fmt_stream_map"].split(",") - for url_desc in data_flashvars: - url_desc_map = dict(urlparse.parse_qsl(url_desc)) - if not url_desc_map.get("url") and not url_desc_map.get("stream"): - continue - try: - key = int(url_desc_map["itag"]) - if not fmt_value.get(key): - continue - - if url_desc_map.get("url"): - url = urllib.unquote(url_desc_map["url"]) - elif url_desc_map.get("conn") and url_desc_map.get("stream"): - url = urllib.unquote(url_desc_map["conn"]) - if url.rfind("/") < len(url) - 1: - url += "/" - url += urllib.unquote(url_desc_map["stream"]) - elif url_desc_map.get("stream") and not url_desc_map.get("conn"): - url = urllib.unquote(url_desc_map["stream"]) - - if url_desc_map.get("sig"): - url += "&signature=" + url_desc_map["sig"] - elif url_desc_map.get("s"): - sig = url_desc_map["s"] - if not js_signature: - urljs = scrapertools.find_single_match(youtube_page_data, '"assets":.*?"js":\s*"([^"]+)"') - urljs = urljs.replace("\\", "") - if urljs: - if not re.search(r'https?://', urljs): - urljs = urlparse.urljoin("https://www.youtube.com", urljs) - data_js = httptools.downloadpage(urljs).data - from jsinterpreter import JSInterpreter - funcname = 
scrapertools.find_single_match(data_js, '\.sig\|\|([A-z0-9$]+)\(') - if not funcname: - funcname = scrapertools.find_single_match(data_js, '["\']signature["\']\s*,\s*' - '([A-z0-9$]+)\(') - if not funcname: - funcname = scrapertools.find_single_match(data_js, r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(') - jsi = JSInterpreter(data_js) - js_signature = jsi.extract_function(funcname) - - - signature = js_signature([sig]) - url += "&sig=" + signature - url = url.replace(",", "%2C") - video_urls.append(["(" + fmt_value[key] + ") [youtube]", url]) - except: - import traceback - logger.info(traceback.format_exc()) + if params.get('player_response'): + params = json.load(params.get('player_response')) + data_flashvars = params["streamingData"] + for s_data in data_flashvars: + if s_data in ["adaptiveFormats", "formats"]: + for opt in data_flashvars[s_data]: + opt = dict(opt) + if "audioQuality" not in opt: + continue + if "cipher" in opt: + signature = get_signature(youtube_page_data) + cipher = dict(urlparse.parse_qsl(urllib.unquote(opt["cipher"]))) + url = re.search('url=(.*)', opt["cipher"]).group(1) + s = cipher.get('s') + url = "%s&sig=%s" % (urllib.unquote(url), signature([s])) + video_urls.append(["%s" % itag_list.get(opt["itag"], "video"), url]) + elif opt["itag"] in itag_list: + video_urls.append(["%s" % itag_list.get(opt["itag"], "video"), opt["url"]]) return video_urls