diff --git a/plugin.video.alfa/lib/unshortenit.py b/plugin.video.alfa/lib/unshortenit.py
index 79631c65..f8591eed 100755
--- a/plugin.video.alfa/lib/unshortenit.py
+++ b/plugin.video.alfa/lib/unshortenit.py
@@ -1,227 +1,454 @@
-# -*- coding: utf-8 -*-
-
-try:
-    from selenium.webdriver import PhantomJS
-    from contextlib import closing
-
-    linkbucks_support = True
-except:
-    linkbucks_support = False
-try:
-    from urllib.request import urlsplit, urlparse
-except:
-    from urlparse import urlsplit, urlparse
-import json
-import os
-import re
-import time
-from base64 import b64decode
-
-import requests
-
-
-class UnshortenIt(object):
-    _headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-                'Accept-Encoding': 'gzip,deflate,sdch',
-                'Accept-Language': 'en-US,en;q=0.8',
-                'Connection': 'keep-alive',
-                'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.69 Safari/537.36'}
-    _adfly_regex = r'adf\.ly|q\.gs|j\.gs|u\.bb|ay\.gy'
-    _linkbucks_regex = r'linkbucks\.com|any\.gs|cash4links\.co|cash4files\.co|dyo\.gs|filesonthe\.net|goneviral\.com|megaline\.co|miniurls\.co|qqc\.co|seriousdeals\.net|theseblogs\.com|theseforums\.com|tinylinks\.co|tubeviral\.com|ultrafiles\.net|urlbeat\.net|whackyvidz\.com|yyv\.co'
-    _adfocus_regex = r'adfoc\.us'
-    _lnxlu_regex = r'lnx\.lu'
-    _shst_regex = r'sh\.st'
-    _this_dir, _this_filename = os.path.split(__file__)
-    _timeout = 10
-
-    def unshorten(self, uri, type=None, timeout=10):
-        domain = urlsplit(uri).netloc
-        self._timeout = timeout
-
-        if re.search(self._adfly_regex, domain, re.IGNORECASE) or type == 'adfly':
-            return self._unshorten_adfly(uri)
-        if re.search(self._adfocus_regex, domain, re.IGNORECASE) or type == 'adfocus':
-            return self._unshorten_adfocus(uri)
-        if re.search(self._linkbucks_regex, domain, re.IGNORECASE) or type == 'linkbucks':
-            if linkbucks_support:
-                return self._unshorten_linkbucks(uri)
-            else:
-                return uri, 'linkbucks.com not supported. Install selenium package to add support.'
-        if re.search(self._lnxlu_regex, domain, re.IGNORECASE) or type == 'lnxlu':
-            return self._unshorten_lnxlu(uri)
-        if re.search(self._shst_regex, domain, re.IGNORECASE):
-            return self._unshorten_shst(uri)
-
-        try:
-            # headers stop t.co from working so omit headers if this is a t.co link
-            if domain == 't.co':
-                r = requests.get(uri, timeout=self._timeout)
-                return r.url, r.status_code
-            # p.ost.im uses meta http refresh to redirect.
-            if domain == 'p.ost.im':
-                r = requests.get(uri, headers=self._headers, timeout=self._timeout)
-                uri = re.findall(r'.*url\=(.*?)\"\.*', r.text)[0]
-                return uri, 200
-            r = requests.head(uri, headers=self._headers, timeout=self._timeout)
-            while True:
-                if 'location' in r.headers:
-                    r = requests.head(r.headers['location'])
-                    uri = r.url
-                else:
-                    return r.url, r.status_code
-
-        except Exception as e:
-            return uri, str(e)
-
-    def _unshorten_adfly(self, uri):
-
-        try:
-            r = requests.get(uri, headers=self._headers, timeout=self._timeout)
-            html = r.text
-            ysmm = re.findall(r"var ysmm =.*\;?", html)
-
-            if len(ysmm) > 0:
-                ysmm = re.sub(r'var ysmm \= \'|\'\;', '', ysmm[0])
-
-                left = ''
-                right = ''
-
-                for c in [ysmm[i:i + 2] for i in range(0, len(ysmm), 2)]:
-                    left += c[0]
-                    right = c[1] + right
-
-                decoded_uri = b64decode(left.encode() + right.encode())[2:].decode()
-
-                if re.search(r'go\.php\?u\=', decoded_uri):
-                    decoded_uri = b64decode(re.sub(r'(.*?)u=', '', decoded_uri)).decode()
-
-                return decoded_uri, r.status_code
-            else:
-                return uri, 'No ysmm variable found'
-
-        except Exception as e:
-            return uri, str(e)
-
-    def _unshorten_linkbucks(self, uri):
-        try:
-            with closing(PhantomJS(
-                    service_log_path=os.path.dirname(os.path.realpath(__file__)) + '/ghostdriver.log')) as browser:
-                browser.get(uri)
-
-                # wait 5 seconds
-                time.sleep(5)
-
-                page_source = browser.page_source
-
-                link = re.findall(r'skiplink(.*?)\>', page_source)
-                if link is not None:
-                    link = re.sub(r'\shref\=|\"', '', link[0])
-                    if link == '':
-                        return uri, 'Failed to extract link.'
-                    return link, 200
-                else:
-                    return uri, 'Failed to extract link.'
-
-        except Exception as e:
-            return uri, str(e)
-
-    def _unshorten_adfocus(self, uri):
-        orig_uri = uri
-        try:
-            http_header = {
-                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.46 Safari/535.11",
-                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-                "Accept-Language": "nl-NL,nl;q=0.8,en-US;q=0.6,en;q=0.4",
-                "Cache-Control": "no-cache",
-                "Pragma": "no-cache"
-            }
-
-            r = requests.get(uri, headers=http_header, timeout=self._timeout)
-            html = r.text
-
-            adlink = re.findall("click_url =.*;", html)
-
-            if len(adlink) > 0:
-                uri = re.sub('^click_url = "|"\;$', '', adlink[0])
-                if re.search(r'http(s|)\://adfoc\.us/serve/skip/\?id\=', uri):
-                    http_header = {
-                        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.46 Safari/535.11",
-                        "Accept-Encoding": "gzip,deflate,sdch",
-                        "Accept-Language": "en-US,en;,q=0.8",
-                        "Connection": "keep-alive",
-                        "Host": "adfoc.us",
-                        "Cache-Control": "no-cache",
-                        "Pragma": "no-cache",
-                        "Referer": orig_uri,
-                    }
-                    r = requests.get(uri, headers=http_header, timeout=self._timeout)
-
-                uri = r.url
-                return uri, r.status_code
-            else:
-                return uri, 'No click_url variable found'
-        except Exception as e:
-            return uri, str(e)
-
-    def _unshorten_lnxlu(self, uri):
-        try:
-            r = requests.get(uri, headers=self._headers, timeout=self._timeout)
-            html = r.text
-
-            code = re.findall('/\?click\=(.*)\."', html)
-
-            if len(code) > 0:
-                payload = {'click': code[0]}
-                r = requests.get('http://lnx.lu/', params=payload, headers=self._headers, timeout=self._timeout)
-                return r.url, r.status_code
-            else:
-                return uri, 'No click variable found'
-        except Exception as e:
-            return uri, str(e)
-
-    def _unshorten_shst(self, uri):
-        try:
-            r = requests.get(uri, headers=self._headers, timeout=self._timeout)
-            html = r.text
-
-            session_id = re.findall(r'sessionId\:(.*?)\"\,', html)
-            if len(session_id) > 0:
-                session_id = re.sub(r'\s\"', '', session_id[0])
-
-                http_header = {
-                    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.46 Safari/535.11",
-                    "Accept-Encoding": "gzip,deflate,sdch",
-                    "Accept-Language": "en-US,en;,q=0.8",
-                    "Connection": "keep-alive",
-                    "Content-Type": "application/x-www-form-urlencoded",
-                    "Host": "sh.st",
-                    "Referer": uri,
-                    "Origin": "http://sh.st",
-                    "X-Requested-With": "XMLHttpRequest"
-                }
-
-                time.sleep(5)
-
-                payload = {'adSessionId': session_id, 'callback': 'c'}
-                r = requests.get('http://sh.st/shortest-url/end-adsession', params=payload, headers=http_header,
-                                 timeout=self._timeout)
-                response = r.content[6:-2].decode('utf-8')
-
-                if r.status_code == 200:
-                    resp_uri = json.loads(response)['destinationUrl']
-                    if resp_uri is not None:
-                        uri = resp_uri
-                    else:
-                        return uri, 'Error extracting url'
-                else:
-                    return uri, 'Error extracting url'
-
-            return uri, r.status_code
-
-        except Exception as e:
-            return uri, str(e)
-
-
-def unshorten(uri, type=None, timeout=10):
-    unshortener = UnshortenIt()
-    return unshortener.unshorten(uri, type, timeout)
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+try:
+    from urllib.parse import urlsplit, urlparse, parse_qs, urljoin, urlencode
+except:
+    from urlparse import urlsplit, urlparse, parse_qs, urljoin
+    from urllib import urlencode
+
+import json
+import os
+import re
+import time
+from base64 import b64decode
+from platformcode import logger
+
+import xbmc
+
+from core import httptools
+
+
+def find_in_text(regex, text, flags=re.IGNORECASE | re.DOTALL):
+    rec = re.compile(regex, flags=flags)
+    match = rec.search(text)
+    if not match:
+        return False
+    return match.group(1)
+
+
+class UnshortenIt(object):
+    _adfly_regex = r'adf\.ly|q\.gs|j\.gs|u\.bb|ay\.gy|threadsphere\.bid|restorecosm\.bid'
+    _linkbucks_regex = r'linkbucks\.com|any\.gs|cash4links\.co|cash4files\.co|dyo\.gs|filesonthe\.net|goneviral\.com|megaline\.co|miniurls\.co|qqc\.co|seriousdeals\.net|theseblogs\.com|theseforums\.com|tinylinks\.co|tubeviral\.com|ultrafiles\.net|urlbeat\.net|whackyvidz\.com|yyv\.co'
+    _adfocus_regex = r'adfoc\.us'
+    _lnxlu_regex = r'lnx\.lu'
+    _shst_regex = r'sh\.st'
+    _hrefli_regex = r'href\.li'
+    _anonymz_regex = r'anonymz\.com'
+    _shrink_service_regex = r'shrink-service\.it'
+    _rapidcrypt_regex = r'rapidcrypt\.net'
+
+    _maxretries = 5
+
+    _this_dir, _this_filename = os.path.split(__file__)
+    _timeout = 10
+
+    def unshorten(self, uri, type=None):
+
+        domain = urlsplit(uri).netloc
+
+        if not domain:
+            return uri, "No domain found in URI!"
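+
+        # Google wraps outbound links in a tracking redirect
+        # (http(s)://www.google.com/url?...); strip that wrapper first so the
+        # domain checks below see the real target host.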
+        had_google_outbound, uri = self._clear_google_outbound_proxy(uri)
+
+        if re.search(self._adfly_regex, domain, re.IGNORECASE) or type == 'adfly':
+            return self._unshorten_adfly(uri)
+        if re.search(self._adfocus_regex, domain, re.IGNORECASE) or type == 'adfocus':
+            return self._unshorten_adfocus(uri)
+        if re.search(self._linkbucks_regex, domain, re.IGNORECASE) or type == 'linkbucks':
+            return self._unshorten_linkbucks(uri)
+        if re.search(self._lnxlu_regex, domain, re.IGNORECASE) or type == 'lnxlu':
+            return self._unshorten_lnxlu(uri)
+        if re.search(self._shrink_service_regex, domain, re.IGNORECASE):
+            return self._unshorten_shrink_service(uri)
+        if re.search(self._shst_regex, domain, re.IGNORECASE):
+            return self._unshorten_shst(uri)
+        if re.search(self._hrefli_regex, domain, re.IGNORECASE):
+            return self._unshorten_hrefli(uri)
+        if re.search(self._anonymz_regex, domain, re.IGNORECASE):
+            return self._unshorten_anonymz(uri)
+        if re.search(self._rapidcrypt_regex, domain, re.IGNORECASE):
+            return self._unshorten_rapidcrypt(uri)
+
+        return uri, 200
+
+    def unwrap_30x(self, uri, timeout=10):
+
+        domain = urlsplit(uri).netloc
+        self._timeout = timeout
+
+        try:
+            # headers stop t.co from working so omit headers if this is a t.co link
+            if domain == 't.co':
+                r = httptools.downloadpage(uri, timeout=self._timeout)
+                return r.url, r.code
+
+            # p.ost.im uses meta http refresh to redirect.
+            if domain == 'p.ost.im':
+                r = httptools.downloadpage(uri, timeout=self._timeout)
+                uri = re.findall(r'.*url\=(.*?)\"\.*', r.data)[0]
+                return uri, 200
+
+            retries = 0
+            while True:
+                if retries > self._maxretries:
+                    raise ValueError("Infinitely looping redirect from URL: '%s'" %
+                                     (uri,))
+
+                r = httptools.downloadpage(
+                    uri,
+                    timeout=self._timeout,
+                    follow_redirects=False,
+                    only_headers=True)
+                if not r.success:
+                    return uri, -1
+
+                if 'location' in r.headers:
+                    r = httptools.downloadpage(
+                        r.headers['location'],
+                        timeout=self._timeout,
+                        follow_redirects=False,
+                        only_headers=True)
+                    uri = r.url
+                    retries += 1
+                else:
+                    return r.url, r.code
+
+        except Exception as e:
+            return uri, str(e)
+
+    def _clear_google_outbound_proxy(self, url):
+        '''
+        Google proxies its outbound links through a redirect so it can track
+        outbound clicks. This call strips that proxy out if it is present.
+
+        This is useful for doing things like parsing google search results, or
+        if you're scraping google docs, where google inserts hit-counters on
+        all outbound links.
+        '''
+
+        # This is kind of hacky, because we need to check both the netloc AND
+        # part of the path. We could use urllib.parse.urlsplit, but it's
+        # easier and just as effective to use string checks.
+        if url.startswith("http://www.google.com/url?") or \
+                url.startswith("https://www.google.com/url?"):
+
+            qs = urlparse(url).query
+            query = parse_qs(qs)
+
+            if "q" in query:  # Google doc outbound links (maybe blogspot, too)
+                return True, query["q"].pop()
+            elif "url" in query:  # Outbound links from google searches
+                return True, query["url"].pop()
+            else:
+                raise ValueError(
+                    "Google outbound proxy URL without a target url ('%s')?" % url)
+
+        return False, url
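+
+    # adf.ly hides the destination in a JS variable 'ysmm': the even-index
+    # characters form the left half and the odd-index characters, reversed,
+    # the right half of a base64 string whose middle section is the target URL.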
+    def _unshorten_adfly(self, uri):
+        logger.info()
+        try:
+            r = httptools.downloadpage(uri, timeout=self._timeout, cookies=False)
+            html = r.data
+            logger.info("Intel33 %s" % html)
+            ysmm = re.findall(r"var ysmm =.*\;?", html)
+
+            if len(ysmm) > 0:
+                ysmm = re.sub(r'var ysmm \= \'|\'\;', '', ysmm[0])
+
+                left = ''
+                right = ''
+
+                for c in [ysmm[i:i + 2] for i in range(0, len(ysmm), 2)]:
+                    left += c[0]
+                    right = c[1] + right
+
+                # Additional digit arithmetic: XOR consecutive pairs of digits
+                # and keep the result when it is still a single digit.
+                encoded_uri = list(left + right)
+                numbers = ((i, n) for i, n in enumerate(encoded_uri) if str.isdigit(n))
+                for first, second in zip(numbers, numbers):
+                    xor = int(first[1]) ^ int(second[1])
+                    if xor < 10:
+                        encoded_uri[first[0]] = str(xor)
+
+                decoded_uri = b64decode("".join(encoded_uri).encode())[16:-16].decode()
+
+                if re.search(r'go\.php\?u\=', decoded_uri):
+                    decoded_uri = b64decode(re.sub(r'(.*?)u=', '', decoded_uri)).decode()
+
+                return decoded_uri, r.code
+            else:
+                return uri, 'No ysmm variable found'
+
+        except Exception as e:
+            return uri, str(e)
+
+    def _unshorten_linkbucks(self, uri):
+        '''
+        (Attempt) to decode linkbucks content. HEAVILY based on the OSS
+        jDownloader codebase. This has necessitated a license change.
+        '''
+
+        r = httptools.downloadpage(uri, timeout=self._timeout)
+
+        firstGet = time.time()
+
+        baseloc = r.url
+
+        if "/notfound/" in r.url or re.search(
+                r">Link Not Found<|>The link may have been deleted by the owner|To access the content, you must complete a quick survey\.",
+                r.data):
+            return uri, 'Error: Link not found or requires a survey!'
+
+        link = None
+
+        content = r.data
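+
+        # Try the known direct-link patterns first; if none match, fall back
+        # to locating the Lbjs init script and replaying the intermission
+        # handshake it performs.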
+        regexes = [
+            r"<div id=\"lb_header\">.*?/a>.*?<a[^>]*href=\"(.*?)\".*?class=\"lb",
+            r"Lbjs\.TargetUrl = '(http://[^<>\"]*?)'",
+            r"noscript><a href=\"(.*?)\"",
+        ]
+
+        for regex in regexes:
+            link = find_in_text(regex, content)
+            if link:
+                break
+
+        if link:
+            return link, r.code
+
+        scripts = re.findall(r"<script[^>]*>([^<]+)</script>", content)
+        if not scripts:
+            return uri, "No script bodies found?"
+
+        js = False
+
+        for script in scripts:
+            # cleanup: strip single-line '//' comments
+            script = re.sub(r"[\r\n\s]+\/\/\s*[^\r\n]+", "", script)
+            if re.search(r"\s*var\s*f\s*=\s*window\['init'\s*\+\s*'Lb'\s*\+\s*'js'\s*\+\s*''\];[\r\n\s]+", script):
+                js = script
+
+        if not js:
+            return uri, "Could not find correct script?"
+
+        token = find_in_text(r"Token\s*:\s*'([a-f0-9]{40})'", js)
+        if not token:
+            token = find_in_text(r"\?t=([a-f0-9]{40})", js)
+
+        assert token
+
+        # The page assembles the name 'AuthKey' from string fragments, so the
+        # pattern has to allow a "' + '" between every letter.
+        authKeyMatchStr = r"A(?:'\s*\+\s*')?u(?:'\s*\+\s*')?t(?:'\s*\+\s*')?h(?:'\s*\+\s*')?K(?:'\s*\+\s*')?e(?:'\s*\+\s*')?y"
+        l1 = find_in_text(r"\s*params\['" + authKeyMatchStr + r"'\]\s*=\s*(\d+?);", js)
+        l2 = find_in_text(
+            r"\s*params\['" + authKeyMatchStr + r"'\]\s*=\s?params\['" + authKeyMatchStr + r"'\]\s*\+\s*(\d+?);",
+            js)
+
+        if any([not l1, not l2, not token]):
+            return uri, "Missing required tokens?"
+
+        authkey = int(l1) + int(l2)
+
+        p1_url = urljoin(baseloc, "/director/?t={tok}".format(tok=token))
+        r2 = httptools.downloadpage(p1_url, timeout=self._timeout)
+
+        p2_url = urljoin(baseloc, "/scripts/jquery.js?r={tok}&{key}".format(tok=token, key=l1))
+        r2_1 = httptools.downloadpage(p2_url, timeout=self._timeout)
+
+        # Linkbucks will not release the target until ~5 seconds have passed
+        # since the first page load.
+        time_left = 5.033 - (time.time() - firstGet)
+        xbmc.sleep(int(max(time_left, 0) * 1000))
+
+        p3_url = urljoin(baseloc, "/intermission/loadTargetUrl?t={tok}&aK={key}&a_b=false".format(tok=token,
+                                                                                                  key=str(authkey)))
+        r3 = httptools.downloadpage(p3_url, timeout=self._timeout)
+
+        resp_json = json.loads(r3.data)
+        if "Url" in resp_json:
+            return resp_json['Url'], r3.code
+
+        return uri, 'Failed to extract link.'
+
+    def inValidate(self, s):
+        # Original java conditional:
+        # (s == null || s != null && (s.matches("[\r\n\t ]+") || s.equals("") || s.equalsIgnoreCase("about:blank")))
+        if not s:
+            return True
+
+        if re.search("[\r\n\t ]+", s) or s.lower() == "about:blank":
+            return True
+        else:
+            return False
+
+    def _unshorten_adfocus(self, uri):
+        orig_uri = uri
+        try:
+            r = httptools.downloadpage(uri, timeout=self._timeout)
+            html = r.data
+
+            adlink = re.findall("click_url =.*;", html)
+
+            if len(adlink) > 0:
+                uri = re.sub(r'^click_url = "|"\;$', '', adlink[0])
+                if re.search(r'http(s|)\://adfoc\.us/serve/skip/\?id\=', uri):
+                    http_header = dict()
+                    http_header["Host"] = "adfoc.us"
+                    http_header["Referer"] = orig_uri
+
+                    r = httptools.downloadpage(uri, headers=http_header, timeout=self._timeout)
+
+                uri = r.url
+                return uri, r.code
+            else:
+                return uri, 'No click_url variable found'
+        except Exception as e:
+            return uri, str(e)
+
+    def _unshorten_lnxlu(self, uri):
+        try:
+            r = httptools.downloadpage(uri, timeout=self._timeout)
+            html = r.data
+
+            code = re.findall(r'/\?click\=(.*)\."', html)
+
+            if len(code) > 0:
+                payload = {'click': code[0]}
+                r = httptools.downloadpage(
+                    'http://lnx.lu?' + urlencode(payload),
+                    timeout=self._timeout)
+                return r.url, r.code
+            else:
+                return uri, 'No click variable found'
+        except Exception as e:
+            return uri, str(e)
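+
+    # sh.st embeds an adSessionId in the page JS and only reveals the
+    # destination once the ad session is ended, hence the 5 second wait below.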
+    def _unshorten_shst(self, uri):
+        try:
+            r = httptools.downloadpage(uri, timeout=self._timeout)
+            html = r.data
+
+            session_id = re.findall(r'sessionId\:(.*?)\"\,', html)
+            if len(session_id) > 0:
+                session_id = re.sub(r'\s\"', '', session_id[0])
+
+                http_header = dict()
+                http_header["Content-Type"] = "application/x-www-form-urlencoded"
+                http_header["Host"] = "sh.st"
+                http_header["Referer"] = uri
+                http_header["Origin"] = "http://sh.st"
+                http_header["X-Requested-With"] = "XMLHttpRequest"
+
+                xbmc.sleep(5 * 1000)
+
+                payload = {'adSessionId': session_id, 'callback': 'c'}
+                r = httptools.downloadpage(
+                    'http://sh.st/shortest-url/end-adsession?' + urlencode(payload),
+                    headers=http_header,
+                    timeout=self._timeout)
+                # Strip the JSONP callback wrapper around the payload.
+                response = r.data[6:-2].decode('utf-8')
+
+                if r.code == 200:
+                    resp_uri = json.loads(response)['destinationUrl']
+                    if resp_uri is not None:
+                        uri = resp_uri
+                    else:
+                        return uri, 'Error extracting url'
+                else:
+                    return uri, 'Error extracting url'
+
+            return uri, r.code
+
+        except Exception as e:
+            return uri, str(e)
+
+    def _unshorten_hrefli(self, uri):
+        try:
+            # Extract url from query
+            parsed_uri = urlparse(uri)
+            extracted_uri = parsed_uri.query
+            if not extracted_uri:
+                return uri, 200
+            # Get url status code
+            r = httptools.downloadpage(
+                extracted_uri,
+                timeout=self._timeout,
+                follow_redirects=False,
+                only_headers=True)
+            return r.url, r.code
+        except Exception as e:
+            return uri, str(e)
+
+    def _unshorten_anonymz(self, uri):
+        # For the moment they use the same system as href.li
+        return self._unshorten_hrefli(uri)
+
+    def _unshorten_shrink_service(self, uri):
+        try:
+            r = httptools.downloadpage(uri, timeout=self._timeout, cookies=False)
+            html = r.data
+
+            # Target link; its href arrives HTML-entity encoded
+            uri = re.findall(r'<a href="([^"]+)">', html)[0]
+
+            from core import scrapertools
+            uri = scrapertools.decodeHtmlentities(uri)
+
+            # Manually map the numeric entities the generic decoder may miss
+            uri = uri.replace("&#47;", "/") \
+                .replace("&#58;", ":") \
+                .replace("&#46;", ".") \
+                .replace("&#33;", "!") \
+                .replace("&#35;", "#") \
+                .replace("&#63;", "?") \
+                .replace("&#95;", "_")
+
+            return uri, r.code
+
+        except Exception as e:
+            return uri, str(e)
+
+    def _unshorten_rapidcrypt(self, uri):
+        try:
+            r = httptools.downloadpage(uri, timeout=self._timeout, cookies=False)
+            html = r.data
+
+            # href of the "Click to continue" button
+            uri = re.findall(r'<a[^>]*href="([^"]+)"[^>]*>Click to continue</a>', html)[0]
+
+            return uri, r.code
+
+        except Exception as e:
+            return uri, str(e)
+
+
+def unwrap_30x_only(uri, timeout=10):
+    unshortener = UnshortenIt()
+    uri, status = unshortener.unwrap_30x(uri, timeout=timeout)
+    return uri, status
+
+
+def unshorten_only(uri, type=None, timeout=10):
+    unshortener = UnshortenIt()
+    uri, status = unshortener.unshorten(uri, type=type)
+    return uri, status
+
+
+def unshorten(uri, type=None, timeout=10):
+    unshortener = UnshortenIt()
+    uri, status = unshortener.unshorten(uri, type=type)
+    if status == 200:
+        uri, status = unshortener.unwrap_30x(uri, timeout=timeout)
+    return uri, status
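+
+
+# Usage sketch (requires the Kodi runtime for httptools/xbmc): unshorten()
+# first runs the service-specific decoders, then follows plain 30x redirects.
+# 'status' is an HTTP status code on success or an error string on failure.
+#
+#   uri, status = unshorten('http://adf.ly/XXXX')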