Merge branch 'master' into master

2019-04-11 21:29:25 +02:00
parent ec3dca1022 4ae99fc44c
commit a9090a1ef3
44 changed files with 1735 additions and 330 deletions
@@ -10,7 +10,6 @@ import urlparse

 from platformcode import logger
 from decimal import Decimal
-from js2py.internals import seval


 class Cloudflare:
@@ -47,25 +46,50 @@ class Cloudflare:
                logger.debug("Metodo #2 (headers): NO disponible")
                self.header_data = {}

-
    def solve_cf(self, body, domain):
-        k = re.compile('<div style="display:none;visibility:hidden;" id=".*?">(.*?)<\/div>', re.DOTALL).findall(body)
-        k1 = re.compile('function\(p\){var p = eval\(eval.*?atob.*?return \+\(p\)}\(\)', re.DOTALL).findall(body)
-        if k1:
-            body = body.replace(k1[0], k[0])
-        js = re.search(r"setTimeout\(function\(\){\s+(var "
-                    "s,t,o,p,b,r,e,a,k,i,n,g,f.+?\r?\n[\s\S]+?a\.value =.+?)\r?\n", body).group(1)
+        js = re.search(
+            r"setTimeout\(function\(\){\s+(var s,t,o,p,b,r,e,a,k,i,n,g,f.+?\r?\n[\s\S]+?a\.value =.+?)\r?\n",
+            body
+        ).group(1)
+
        js = re.sub(r"a\.value = ((.+).toFixed\(10\))?", r"\1", js)
+        js = re.sub(r'(e\s=\sfunction\(s\)\s{.*?};)', '', js, flags=re.DOTALL|re.MULTILINE)
        js = re.sub(r"\s{3,}[a-z](?: = |\.).+", "", js).replace("t.length", str(len(domain)))
        js = js.replace('; 121', '')
-        reemplazar = re.compile('(?is)function\(p\)\{return eval.*?\+p\+"\)"\)}', re.DOTALL).findall(js)
-        if reemplazar:
-            js = js.replace(reemplazar[0],'t.charCodeAt')
        js = re.sub(r"[\n\\']", "", js)
-        js = 'a = {{}}; t = "{}";{}'.format(domain, js)
-        result = seval.eval_js_vm(js)
+        jsEnv = """
+        var t = "{domain}";
+        var g = String.fromCharCode;
+        o = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=";
+        e = function(s) {{
+            s += "==".slice(2 - (s.length & 3));
+            var bm, r = "", r1, r2, i = 0;
+            for (; i < s.length;) {{
+                bm = o.indexOf(s.charAt(i++)) << 18 | o.indexOf(s.charAt(i++)) << 12 | (r1 = o.indexOf(s.charAt(i++))) << 6 | (r2 = o.indexOf(s.charAt(i++)));
+                r += r1 === 64 ? g(bm >> 16 & 255) : r2 === 64 ? g(bm >> 16 & 255, bm >> 8 & 255) : g(bm >> 16 & 255, bm >> 8 & 255, bm & 255);
+            }}
+            return r;
+        }};
+        function italics (str) {{ return '<i>' + this + '</i>'; }};
+        var document = {{
+            getElementById: function () {{
+                return {{'innerHTML': '{innerHTML}'}};
+            }}
+        }};
+        {js}
+        """
+        innerHTML = re.search('<div(?: [^<>]*)? id="([^<>]*?)">([^<>]*?)<\/div>', body , re.MULTILINE | re.DOTALL)
+        innerHTML = innerHTML.group(2).replace("'", r"\'") if innerHTML else ""
+        import js2py
+        from jsc import jsunc
+        js = jsunc(jsEnv.format(domain=domain, innerHTML=innerHTML, js=js))
+        def atob(s):
+            return base64.b64decode('{}'.format(s)).decode('utf-8')
+        js2py.disable_pyimport()
+        context = js2py.EvalJs({'atob': atob})
+        result = context.eval(js)
        return float(result)
-        
+

    @property
    def wait_time(self):
@@ -32,7 +32,6 @@ import urlparse
 from StringIO import StringIO
 from threading import Lock

-from core.cloudflare import Cloudflare
 from platformcode import config, logger
 from platformcode.logger import WebErrorException

@@ -76,21 +75,21 @@ def get_url_headers(url):
    return url + "|" + "&".join(["%s=%s" % (h, headers[h]) for h in headers])


-def load_cookies():
+def load_cookies(alfa_s=False):
    cookies_lock.acquire()
    if os.path.isfile(ficherocookies):
-        logger.info("Leyendo fichero cookies")
+        if not alfa_s: logger.info("Leyendo fichero cookies")
        try:
            cj.load(ficherocookies, ignore_discard=True)
        except:
-            logger.info("El fichero de cookies existe pero es ilegible, se borra")
+            if not alfa_s: logger.info("El fichero de cookies existe pero es ilegible, se borra")
            os.remove(ficherocookies)
    cookies_lock.release()


-def save_cookies():
+def save_cookies(alfa_s=False):
    cookies_lock.acquire()
-    logger.info("Guardando cookies...")
+    if not alfa_s: logger.info("Guardando cookies...")
    cj.save(ficherocookies, ignore_discard=True)
    cookies_lock.release()

@@ -99,7 +98,7 @@ load_cookies()


 def downloadpage(url, post=None, headers=None, timeout=None, follow_redirects=True, cookies=True, replace_headers=False,
-                 add_referer=False, only_headers=False, bypass_cloudflare=True, count_retries=0, count_retries_tot=5, random_headers=False, ignore_response_code=False, alfa_s=False, proxy=True, proxy_web=False, forced_proxy=None, proxy_retries=1):
+                 add_referer=False, only_headers=False, bypass_cloudflare=True, count_retries=0, count_retries_tot=5, random_headers=False, ignore_response_code=False, alfa_s=False, proxy=True, proxy_web=False, proxy_addr_forced=None,forced_proxy=None, proxy_retries=1):
    """
    Abre una url y retorna los datos obtenidos

@@ -174,19 +173,28 @@ def downloadpage(url, post=None, headers=None, timeout=None, follow_redirects=Tr
        proxy_CF_addr = ''
        proxy_web_name = ''
        proxy_log = ''
-        import proxytools
        
        try:
-            if (proxy or proxy_web) and (forced_proxy or proxytools.channel_proxy_list(url, forced_proxy=forced_proxy)):
+            if (proxy or proxy_web) and (forced_proxy or proxy_addr_forced or channel_proxy_list(url, forced_proxy=forced_proxy)):
+                import proxytools
                proxy_addr, proxy_CF_addr, proxy_web_name, proxy_log = proxytools.get_proxy_addr(url, post=post, forced_proxy=forced_proxy)
+                if proxy_addr_forced and proxy_log:
+                    import scrapertools
+                    proxy_log = scrapertools.find_single_match(str(proxy_addr_forced), "{'http.*':\s*'(.*?)'}")
            
                if proxy and proxy_addr:
+                    if proxy_addr_forced: proxy_addr = proxy_addr_forced
                    handlers.append(urllib2.ProxyHandler(proxy_addr))
                    proxy_stat = ', Proxy Direct ' + proxy_log
                elif proxy and proxy_CF_addr:
+                    if proxy_addr_forced: proxy_CF_addr = proxy_addr_forced
                    handlers.append(urllib2.ProxyHandler(proxy_CF_addr))
                    proxy_stat = ', Proxy CF ' + proxy_log
-                elif proxy and not proxy_addr and not proxy_CF_addr:
+                elif proxy and proxy_addr_forced:
+                    proxy_addr = proxy_addr_forced
+                    handlers.append(urllib2.ProxyHandler(proxy_addr))
+                    proxy_stat = ', Proxy Direct ' + proxy_log
+                elif proxy and not proxy_addr and not proxy_CF_addr and not proxy_addr_forced:
                    proxy = False
                    if not proxy_web_name:
                        proxy_addr, proxy_CF_addr, proxy_web_name, proxy_log = proxytools.get_proxy_addr(url, forced_proxy='Total')
@@ -335,7 +343,7 @@ def downloadpage(url, post=None, headers=None, timeout=None, follow_redirects=Tr
                raise WebErrorException(urlparse.urlparse(url)[1])

        if cookies:
-            save_cookies()
+            save_cookies(alfa_s=alfa_s)

        if not alfa_s:
            logger.info("Encoding: %s" % (response["headers"].get('content-encoding')))
@@ -362,6 +370,7 @@ def downloadpage(url, post=None, headers=None, timeout=None, follow_redirects=Tr

        # Anti Cloudflare
        if bypass_cloudflare and count_retries < count_retries_tot:
+            from core.cloudflare import Cloudflare
            cf = Cloudflare(response)
            if cf.is_cloudflare:
                count_retries += 1
@@ -370,15 +379,15 @@ def downloadpage(url, post=None, headers=None, timeout=None, follow_redirects=Tr
                auth_url = cf.get_url()
                if not alfa_s:
                    logger.info("Autorizando... intento %d url: %s" % (count_retries, auth_url))
-                tt = downloadpage(auth_url, headers=request_headers, replace_headers=True, count_retries=count_retries, ignore_response_code=True, count_retries_tot=count_retries_tot, proxy=proxy, proxy_web=proxy_web)
+                tt = downloadpage(auth_url, headers=request_headers, replace_headers=True, count_retries=count_retries, ignore_response_code=True, count_retries_tot=count_retries_tot, proxy=proxy, proxy_web=proxy_web, forced_proxy=forced_proxy, proxy_addr_forced=proxy_addr_forced, alfa_s=alfa_s)
                if tt.code == 403:
-                    tt = downloadpage(url, headers=request_headers, replace_headers=True, count_retries=count_retries, ignore_response_code=True, count_retries_tot=count_retries_tot, proxy=proxy, proxy_web=proxy_web)
+                    tt = downloadpage(url, headers=request_headers, replace_headers=True, count_retries=count_retries, ignore_response_code=True, count_retries_tot=count_retries_tot, proxy=proxy, proxy_web=proxy_web, forced_proxy=forced_proxy, proxy_addr_forced=proxy_addr_forced, alfa_s=alfa_s)
                if tt.sucess:
                    if not alfa_s:
                        logger.info("Autorización correcta, descargando página")
                    resp = downloadpage(url=response["url"], post=post, headers=headers, timeout=timeout,
                                        follow_redirects=follow_redirects, count_retries=count_retries, 
-                                        cookies=cookies, replace_headers=replace_headers, add_referer=add_referer, proxy=proxy, proxy_web=proxy_web, count_retries_tot=count_retries_tot)
+                                        cookies=cookies, replace_headers=replace_headers, add_referer=add_referer, proxy=proxy, proxy_web=proxy_web, count_retries_tot=count_retries_tot, forced_proxy=forced_proxy, proxy_addr_forced=proxy_addr_forced, alfa_s=alfa_s)
                    response["sucess"] = resp.sucess
                    response["code"] = resp.code
                    response["error"] = resp.error
@@ -435,6 +444,30 @@ def random_useragent():
            return UserAgentIem
    
    return default_headers["User-Agent"]
+    
+    
+def channel_proxy_list(url, forced_proxy=None):
+    import base64
+    import ast
+    import scrapertools
+    
+    try:
+        proxy_channel_bloqued_str = base64.b64decode(config.get_setting('proxy_channel_bloqued')).decode('utf-8')
+        proxy_channel_bloqued = dict()
+        proxy_channel_bloqued = ast.literal_eval(proxy_channel_bloqued_str)
+    except:
+        logger.debug('Proxytools no inicializado correctamente')
+        return False
+
+    if not url.endswith('/'):
+        url += '/'
+    if scrapertools.find_single_match(url, '(?:http.*:\/\/)?([^\?|\/]+)(?:\?|\/)') in proxy_channel_bloqued:
+        if forced_proxy:
+            return True
+        if 'ON' in proxy_channel_bloqued[scrapertools.find_single_match(url, '(?:http.*:\/\/)?([^\?|\/]+)(?:\?|\/)')]:
+            return True
+    
+    return False


 class NoRedirectHandler(urllib2.HTTPRedirectHandler):
@@ -10,25 +10,6 @@ from core import httptools
 from platformcode import logger


-def downloadpage(url, post=None, headers=None, follow_redirects=True, timeout=None, header_to_get=None):
-    response = httptools.downloadpage(url, post=post, headers=headers, follow_redirects=follow_redirects,
-                                      timeout=timeout)
-    if header_to_get:
-        return response.headers.get(header_to_get)
-    else:
-        return response.data
-
-
-def downloadpageGzip(url):
-    response = httptools.downloadpage(url, add_referer=True)
-    return response.data
-
-
-def getLocationHeaderFromResponse(url):
-    response = httptools.downloadpage(url, only_headers=True)
-    return response.headers.get("location")
-
-
 def get_header_from_response(url, header_to_get="", post=None, headers=None):
    header_to_get = header_to_get.lower()
    response = httptools.downloadpage(url, post=post, headers=headers, only_headers=True)
@@ -48,11 +29,6 @@ def printMatches(matches):
        i = i + 1


-def get_match(data, patron, index=0):
-    matches = re.findall(patron, data, flags=re.DOTALL)
-    return matches[index]
-
-
 def find_single_match(data, patron, index=0):
    try:
        matches = re.findall(patron, data, flags=re.DOTALL)
@@ -18,10 +18,6 @@ def printMatches(matches):
        i = i + 1


-def get_match(data, patron, index=0):
-    return find_single_match(data, patron, index=0)
-
-
 def find_single_match(data, patron, index=0):
    try:
        matches = re.findall(patron, data, flags=re.DOTALL)