Add localized strings and update unshortenit.py

2018-08-06 21:22:21 +02:00
parent 5f89b8ecb6
commit b3f4316a19
8 changed files with 476 additions and 71 deletions
@@ -12,11 +12,10 @@ import re
 import time
 import urllib
 from base64 import b64decode
-from platformcode import logger

-import xbmc

 from core import httptools
+from platformcode import config


 def find_in_text(regex, text, flags=re.IGNORECASE | re.DOTALL):
@@ -28,24 +27,30 @@ def find_in_text(regex, text, flags=re.IGNORECASE | re.DOTALL):


 class UnshortenIt(object):
-    _adfly_regex = r'adf\.ly|q\.gs|j\.gs|u\.bb|ay\.gy|threadsphere\.bid|restorecosm\.bid|clearload\.bid'
+    _adfly_regex = r'adf\.ly|j\.gs|q\.gs|u\.bb|ay\.gy|atominik\.com|tinyium\.com|microify\.com|threadsphere\.bid|clearload\.bid'
    _linkbucks_regex = r'linkbucks\.com|any\.gs|cash4links\.co|cash4files\.co|dyo\.gs|filesonthe\.net|goneviral\.com|megaline\.co|miniurls\.co|qqc\.co|seriousdeals\.net|theseblogs\.com|theseforums\.com|tinylinks\.co|tubeviral\.com|ultrafiles\.net|urlbeat\.net|whackyvidz\.com|yyv\.co'
    _adfocus_regex = r'adfoc\.us'
    _lnxlu_regex = r'lnx\.lu'
-    _shst_regex = r'sh\.st|gestyy\.com'
+    _shst_regex = r'sh\.st|festyy\.com|ceesty\.com'
    _hrefli_regex = r'href\.li'
    _anonymz_regex = r'anonymz\.com'
    _shrink_service_regex = r'shrink-service\.it'
    _rapidcrypt_regex = r'rapidcrypt\.net'
+
    _maxretries = 5
+
    _this_dir, _this_filename = os.path.split(__file__)
    _timeout = 10

    def unshorten(self, uri, type=None):
+
        domain = urlsplit(uri).netloc
+
        if not domain:
            return uri, "No domain found in URI!"
+
        had_google_outbound, uri = self._clear_google_outbound_proxy(uri)
+
        if re.search(self._adfly_regex, domain,
                     re.IGNORECASE) or type == 'adfly':
            return self._unshorten_adfly(uri)
@@ -68,15 +73,15 @@ class UnshortenIt(object):
            return self._unshorten_anonymz(uri)
        if re.search(self._rapidcrypt_regex, domain, re.IGNORECASE):
            return self._unshorten_rapidcrypt(uri)
+
        return uri, 200
+
    def unwrap_30x(self, uri, timeout=10):
+
        domain = urlsplit(uri).netloc
        self._timeout = timeout
-        loop_counter = 0
+
        try:
-            if loop_counter > 5:
-                raise ValueError("Infinitely looping redirect from URL: '%s'" %
-                                 (uri,))
            # headers stop t.co from working so omit headers if this is a t.co link
            if domain == 't.co':
                r = httptools.downloadpage(uri, timeout=self._timeout)
@@ -86,29 +91,28 @@ class UnshortenIt(object):
                r = httptools.downloadpage(uri, timeout=self._timeout)
                uri = re.findall(r'.*url\=(.*?)\"\.*', r.data)[0]
                return uri, r.code
-            else:

-                while True:
+            retries = 0
+            while True:
+                r = httptools.downloadpage(
+                    uri,
+                    timeout=self._timeout,
+                    follow_redirects=False)
+                if not r.sucess:
+                    return uri, -1
+
+                if 'location' in r.headers and retries < self._maxretries:
                    r = httptools.downloadpage(
-                        uri,
-                        timeout=self._timeout,
-                        follow_redirects=False,
-                        only_headers=True)
-                    if not r.success:
-                        return uri, -1
-                    retries = 0
-                    if 'location' in r.headers and retries < self._maxretries:
-                        r = httptools.downloadpage(
-                            r.headers['location'],
-                            follow_redirects=False,
-                            only_headers=True)
-                        uri = r.url
-                        loop_counter += 1
-                        retries = retries + 1
-                    else:
-                        return r.url, r.code
+                        r.headers['location'],
+                        follow_redirects=False)
+                    uri = r.url
+                    retries += 1
+                else:
+                    return r.url, r.code
+
        except Exception as e:
            return uri, str(e)
+
    def _clear_google_outbound_proxy(self, url):
        '''
        So google proxies all their outbound links through a redirect so they can detect outbound links.
@@ -117,13 +121,16 @@ class UnshortenIt(object):
        This is useful for doing things like parsing google search results, or if you're scraping google
        docs, where google inserts hit-counters on all outbound links.
        '''
+
        # This is kind of hacky, because we need to check both the netloc AND
        # part of the path. We could use urllib.parse.urlsplit, but it's
        # easier and just as effective to use string checks.
        if url.startswith("http://www.google.com/url?") or \
                url.startswith("https://www.google.com/url?"):
+
            qs = urlparse(url).query
            query = parse_qs(qs)
+
            if "q" in query:  # Google doc outbound links (maybe blogspot, too)
                return True, query["q"].pop()
            elif "url" in query:  # Outbound links from google searches
@@ -132,10 +139,11 @@ class UnshortenIt(object):
                raise ValueError(
                    "Google outbound proxy URL without a target url ('%s')?" %
                    url)
+
        return False, url

    def _unshorten_adfly(self, uri):
-        logger.info()
+
        try:
            r = httptools.downloadpage(
                uri, timeout=self._timeout, cookies=False)
@@ -144,11 +152,14 @@ class UnshortenIt(object):

            if len(ysmm) > 0:
                ysmm = re.sub(r'var ysmm \= \'|\'\;', '', ysmm[0])
+
                left = ''
                right = ''
+
                for c in [ysmm[i:i + 2] for i in range(0, len(ysmm), 2)]:
                    left += c[0]
                    right = c[1] + right
+
                # Additional digit arithmetic
                encoded_uri = list(left + right)
                numbers = ((i, n) for i, n in enumerate(encoded_uri) if str.isdigit(n))
@@ -156,9 +167,12 @@ class UnshortenIt(object):
                    xor = int(first[1]) ^ int(second[1])
                    if xor < 10:
                        encoded_uri[first[0]] = str(xor)
+
                decoded_uri = b64decode("".join(encoded_uri).encode())[16:-16].decode()
+
                if re.search(r'go\.php\?u\=', decoded_uri):
                    decoded_uri = b64decode(re.sub(r'(.*?)u=', '', decoded_uri)).decode()
+
                return decoded_uri, r.code
            else:
                return uri, 'No ysmm variable found'
@@ -170,15 +184,25 @@ class UnshortenIt(object):
        '''
        (Attempt) to decode linkbucks content. HEAVILY based on the OSS jDownloader codebase.
        This has necessidated a license change.
+
        '''
+        if config.is_xbmc():
+            import xbmc
+
        r = httptools.downloadpage(uri, timeout=self._timeout)
+
        firstGet = time.time()
+
        baseloc = r.url
+
        if "/notfound/" in r.url or \
                "(>Link Not Found<|>The link may have been deleted by the owner|To access the content, you must complete a quick survey\.)" in r.data:
            return uri, 'Error: Link not found or requires a survey!'
+
        link = None
+
        content = r.data
+
        regexes = [
            r"<div id=\"lb_header\">.*?/a>.*?<a.*?href=\"(.*?)\".*?class=\"lb",
            r"AdBriteInit\(\"(.*?)\"\)",
@@ -187,49 +211,69 @@ class UnshortenIt(object):
            r"src=\"http://static\.linkbucks\.com/tmpl/mint/img/lb\.gif\" /></a>.*?<a href=\"(.*?)\"",
            r"id=\"content\" src=\"([^\"]*)",
        ]
+
        for regex in regexes:
            if self.inValidate(link):
                link = find_in_text(regex, content)
+
        if self.inValidate(link):
            match = find_in_text(r"noresize=\"[0-9+]\" src=\"(http.*?)\"", content)
            if match:
                link = find_in_text(r"\"frame2\" frameborder.*?src=\"(.*?)\"", content)
+
        if self.inValidate(link):
            scripts = re.findall("(<script type=\"text/javascript\">[^<]+</script>)", content)
            if not scripts:
                return uri, "No script bodies found?"
+
            js = False
+
            for script in scripts:
                # cleanup
                script = re.sub(r"[\r\n\s]+\/\/\s*[^\r\n]+", "", script)
                if re.search(r"\s*var\s*f\s*=\s*window\['init'\s*\+\s*'Lb'\s*\+\s*'js'\s*\+\s*''\];[\r\n\s]+", script):
                    js = script
+
            if not js:
                return uri, "Could not find correct script?"
+
            token = find_in_text(r"Token\s*:\s*'([a-f0-9]{40})'", js)
            if not token:
                token = find_in_text(r"\?t=([a-f0-9]{40})", js)
+
            assert token
+
            authKeyMatchStr = r"A(?:'\s*\+\s*')?u(?:'\s*\+\s*')?t(?:'\s*\+\s*')?h(?:'\s*\+\s*')?K(?:'\s*\+\s*')?e(?:'\s*\+\s*')?y"
            l1 = find_in_text(r"\s*params\['" + authKeyMatchStr + r"'\]\s*=\s*(\d+?);", js)
            l2 = find_in_text(
                r"\s*params\['" + authKeyMatchStr + r"'\]\s*=\s?params\['" + authKeyMatchStr + r"'\]\s*\+\s*(\d+?);",
                js)
+
            if any([not l1, not l2, not token]):
                return uri, "Missing required tokens?"
+
            authkey = int(l1) + int(l2)
+
            p1_url = urljoin(baseloc, "/director/?t={tok}".format(tok=token))
            r2 = httptools.downloadpage(p1_url, timeout=self._timeout)
+
            p1_url = urljoin(baseloc, "/scripts/jquery.js?r={tok}&{key}".format(tok=token, key=l1))
-            r2_1 = httptools.downloadpage(p1_url, timeout=self._timeout)
+            r2 = httptools.downloadpage(p1_url, timeout=self._timeout)
+
            time_left = 5.033 - (time.time() - firstGet)
-            xbmc.sleep(max(time_left, 0) * 1000)
+            if config.is_xbmc():
+                xbmc.sleep(max(time_left, 0) * 1000)
+            else:
+                time.sleep(5 * 1000)
+
            p3_url = urljoin(baseloc, "/intermission/loadTargetUrl?t={tok}&aK={key}&a_b=false".format(tok=token,
                                                                                                      key=str(authkey)))
            r3 = httptools.downloadpage(p3_url, timeout=self._timeout)
+
            resp_json = json.loads(r3.data)
            if "Url" in resp_json:
                return resp_json['Url'], r3.code
+
        return "Wat", "wat"

    def inValidate(self, s):
@@ -237,23 +281,30 @@ class UnshortenIt(object):
        # (s == null || s != null && (s.matches("[\r\n\t ]+") || s.equals("") || s.equalsIgnoreCase("about:blank")))
        if not s:
            return True
+
        if re.search("[\r\n\t ]+", s) or s.lower() == "about:blank":
            return True
        else:
            return False
+
    def _unshorten_adfocus(self, uri):
        orig_uri = uri
        try:
+
            r = httptools.downloadpage(uri, timeout=self._timeout)
            html = r.data
+
            adlink = re.findall("click_url =.*;", html)
+
            if len(adlink) > 0:
                uri = re.sub('^click_url = "|"\;$', '', adlink[0])
                if re.search(r'http(s|)\://adfoc\.us/serve/skip/\?id\=', uri):
                    http_header = dict()
                    http_header["Host"] = "adfoc.us"
                    http_header["Referer"] = orig_uri
+
                    r = httptools.downloadpage(uri, headers=http_header, timeout=self._timeout)
+
                    uri = r.url
                return uri, r.code
            else:
@@ -283,16 +334,23 @@ class UnshortenIt(object):
        try:
            r = httptools.downloadpage(uri, timeout=self._timeout)
            html = r.data
+
            session_id = re.findall(r'sessionId\:(.*?)\"\,', html)
            if len(session_id) > 0:
                session_id = re.sub(r'\s\"', '', session_id[0])
+
                http_header = dict()
                http_header["Content-Type"] = "application/x-www-form-urlencoded"
                http_header["Host"] = "sh.st"
                http_header["Referer"] = uri
                http_header["Origin"] = "http://sh.st"
                http_header["X-Requested-With"] = "XMLHttpRequest"
-                xbmc.sleep(5 * 1000)
+                
+                if config.is_xbmc():
+                    xbmc.sleep(5 * 1000)
+                else:
+                    time.sleep(5 * 1000)
+
                payload = {'adSessionId': session_id, 'callback': 'c'}
                r = httptools.downloadpage(
                    'http://sh.st/shortest-url/end-adsession?' +
@@ -300,6 +358,7 @@ class UnshortenIt(object):
                    headers=http_header,
                    timeout=self._timeout)
                response = r.data[6:-2].decode('utf-8')
+
                if r.code == 200:
                    resp_uri = json.loads(response)['destinationUrl']
                    if resp_uri is not None:
@@ -325,8 +384,7 @@ class UnshortenIt(object):
            r = httptools.downloadpage(
                extracted_uri,
                timeout=self._timeout,
-                follow_redirects=False,
-                only_headers=True)
+                follow_redirects=False)
            return r.url, r.code
        except Exception as e:
            return uri, str(e)
@@ -339,9 +397,12 @@ class UnshortenIt(object):
        try:
            r = httptools.downloadpage(uri, timeout=self._timeout, cookies=False)
            html = r.data
+
            uri = re.findall(r"<input type='hidden' name='\d+' id='\d+' value='([^']+)'>", html)[0]
+
            from core import scrapertools
            uri = scrapertools.decodeHtmlentities(uri)
+
            uri = uri.replace("&sol;", "/") \
                .replace("&colon;", ":") \
                .replace("&period;", ".") \
@@ -349,6 +410,7 @@ class UnshortenIt(object):
                .replace("&num;", "#") \
                .replace("&quest;", "?") \
                .replace("&lowbar;", "_")
+
            return uri, r.code

        except Exception as e:
@@ -358,7 +420,9 @@ class UnshortenIt(object):
        try:
            r = httptools.downloadpage(uri, timeout=self._timeout, cookies=False)
            html = r.data
-            uri = re.findall(r'<a class="button" href="([^"]+)">Click to continue</a>', html)[0]
+
+            uri = re.findall(r'<a class="push_button blue" href=([^>]+)>', html)[0]
+
            return uri, r.code

        except Exception as e:
@@ -379,6 +443,7 @@ def unshorten_only(uri, type=None, timeout=10):

 def unshorten(uri, type=None, timeout=10):
    unshortener = UnshortenIt()
+    uri, status = unshortener.unwrap_30x(uri, timeout=timeout)
    uri, status = unshortener.unshorten(uri, type=type)
    if status == 200:
        uri, status = unshortener.unwrap_30x(uri, timeout=timeout)