newScrape

2019-07-19 18:38:52 +02:00
parent fbe8c16f6c
commit 4107dbd5ae
11 changed files with 654 additions and 789 deletions
@@ -92,172 +92,202 @@ def url_decode(url_enc):
 def color(text, color):
    return "[COLOR " + color + "]" + text + "[/COLOR]"

-
-def scrape(item, patron = '', listGroups = [], headers="", blacklist="", data="", patron_block="",
-           patronNext="", action="findvideos", addVideolibrary = True, type_content_dict={}, type_action_dict={}):
+def scrape(func):
+    # the decorated function must return a dict (args) containing the following keys:
    # patron: the patron to use for scraping page, all capturing group must match with listGroups
    # listGroups: a list containing the scraping info obtained by your patron, in order
    # accepted values are: url, title, thumb, quality, year, plot, duration, genre, rating, episode, lang

-    # header: values to pass to request header
+    # headers: values to pass to request header
    # blacklist: titles that you want to exclude(service articles for example)
    # data: if you want to pass data manually, maybe because you need some custom replacement
-    # patron_block: patron to get parts of the page (to scrape with patron attribute),
+    # patronBlock: patron to get parts of the page (to scrape with patron attribute),
    #               if you need a "block inside another block" you can create a list, please note that all matches
    #               will be packed as string
    # patronNext: patron for scraping next page link
    # action: if you want results perform an action different from "findvideos", useful when scraping film by genres
-    # url_host: string to prepend to scrapedurl, useful when url don't contain host
+    # addVideolibrary: if "add to videolibrary" should appear
    # example usage:
    #   import support
    #   itemlist = []
    #   patron = 'blablabla'
    #   headers = [['Referer', host]]
    #   blacklist = 'Request a TV serie!'
-    #   return support.scrape(item, itemlist, patron, ['thumb', 'quality', 'url', 'title', 'title2', 'year', 'plot', 'episode', 'lang'],
+    #   return support.scrape(item, itemlist, patron, ['thumb', 'quality', 'url', 'title', 'year', 'plot', 'episode', 'lang'],
    #                           headers=headers, blacklist=blacklist)
-    # listGroups
-    #    thumb = immagine, quality = qualità, url = link singolo o gruppo, title = titolo film o serie, title2 = titolo aggiuntivo
-    #    year = anno del film o della serie, plot = descrizione film o serie, episode = numero stagione - numero episodio in caso di serie,
-    #    lang = lingua del video
    # 'type' is a check for typologies of content e.g. Film or TV Series
    # 'episode' is a key to grab episode numbers if it is separated from the title
-    # IMPORTANT 'type' is a special key, to work need type_content_dict={} and type_action_dict={}
+    # IMPORTANT: 'type' is a special key; to work it needs the 'type_content_dict' and 'type_action_dict' keys in the returned dict

-    itemlist = []
+    def wrapper(*args):
+        itemlist = []

-    if not data:
-        data = httptools.downloadpage(item.url, headers=headers, ignore_response_code=True).data.replace("'", '"')
-        data = re.sub('\n|\t', ' ', data)
-        # replace all ' with " and eliminate newline, so we don't need to worry about
-        log('DATA =', data)
+        args = func(*args)

-        block = data
+        item = args['item']

-        if patron_block:
-            if type(patron_block) == str:
-                patron_block = [patron_block]
+        action = args['action'] if 'action' in args else 'findvideos'
+        anime = args['anime'] if 'anime' in args else ''
+        addVideolibrary = args['addVideolibrary'] if 'addVideolibrary' in args else True
+        blacklist = args['blacklist'] if 'blacklist' in args else ''
+        data = args['data'] if 'data' in args else ''
+        headers = args['headers'] if 'headers' in args else ''
+        patron = args['patron'] if 'patron' in args else ''
+        patronNext = args['patronNext'] if 'patronNext' in args else ''
+        patronBlock = args['patronBlock'] if 'patronBlock' in args else ''
+        typeActionDict = args['type_action_dict'] if 'type_action_dict' in args else {}
+        typeContentDict = args['type_content_dict'] if 'type_content_dict' in args else {}

-            for n, regex in enumerate(patron_block):
-                blocks = scrapertoolsV2.find_multiple_matches(block, regex)
-                block = ""
-                for b in blocks:
-                    block += "\n" + str(b)
-                log('BLOCK ', n, '=', block)
-    else:
-        block = data
-    if patron and listGroups:
-        matches = scrapertoolsV2.find_multiple_matches(block, patron)
-        log('MATCHES =', matches)
+        if not data:
+            data = httptools.downloadpage(item.url, headers=headers, ignore_response_code=True).data.replace("'", '"')
+            data = re.sub('\n|\t', ' ', data)
+            # every ' is replaced with " and newlines/tabs with spaces, so patterns don't need to account for them
+            log('DATA =', data)

-        known_keys = ['url', 'title', 'title2', 'episode', 'thumb', 'quality', 'year', 'plot', 'duration', 'genere', 'rating', 'type', 'lang'] #by greko aggiunto episode
-        lang = '' # aggiunto per gestire i siti con pagine di serietv dove si hanno i video in ita e in subita
-        
-        for match in matches:
-            if len(listGroups) > len(match):  # to fix a bug
-                match = list(match)
-                match.extend([''] * (len(listGroups) - len(match)))
+            block = data

-            scraped = {}
-            for kk in known_keys:
-                val = match[listGroups.index(kk)] if kk in listGroups else ''
-                if val and (kk == "url" or kk == 'thumb') and 'http' not in val:
-                    val = scrapertoolsV2.find_single_match(item.url, 'https?://[a-z0-9.-]+') + val
-                scraped[kk] = val
+            if patronBlock:
+                if type(patronBlock) == str:
+                    patronBlock = [patronBlock]

-            title = scrapertoolsV2.htmlclean(scrapertoolsV2.decodeHtmlentities(scraped["title"])).replace('’', '\'').replace('"', "'").strip() # fix by greko da " a '
-            plot = scrapertoolsV2.htmlclean(scrapertoolsV2.decodeHtmlentities(scraped["plot"]))
-
-            longtitle = typo(title, 'bold')
-            if scraped['quality']: longtitle = longtitle + typo(scraped['quality'], '_ [] color kod')
-            if scraped['episode']:
-                scraped['episode'] = re.sub(r'\s-\s|-|x|&#8211', 'x' , scraped['episode'])
-                longtitle = typo(scraped['episode'] + ' - ', 'bold') + longtitle
-            if scraped['title2']:
-                title2 = scrapertoolsV2.htmlclean(scrapertoolsV2.decodeHtmlentities(scraped["title2"])).replace('"', "'").strip()
-                longtitle = longtitle + typo(title2, 'bold _ -- _')
-
-            ##    Aggiunto/modificato per gestire i siti che hanno i video
-            ##    in ita e subita delle serie tv nella stessa pagina
-            if scraped['lang'] == '': #altrimenti nei canali dei film mi aggiunge sub-ita a tutti i film successivi
-                lang = '' # o in alternativa lang = 'ITA'
-            if scraped['lang']:              
-                if 'sub' in scraped['lang'].lower():
-                    lang = 'Sub-ITA'
-                else:
-                    lang = 'ITA'                      
-            if lang != '':
-                longtitle += typo(lang, '_ [] color kod')
-
-            if item.infoLabels["title"] or item.fulltitle:  # if title is set, probably this is a list of episodes or video sources
-                infolabels = item.infoLabels
-            else:
-                infolabels = {}
-                if scraped["year"]:
-                    infolabels['year'] = scraped["year"]
-                if scraped["plot"]:
-                    infolabels['plot'] = plot
-                if scraped["duration"]:
-                    matches = scrapertoolsV2.find_multiple_matches(scraped["duration"],r'([0-9])\s*?(?:[hH]|:|\.|,|\\|\/|\||\s)\s*?([0-9]+)')
-                    for h, m in matches:
-                        scraped["duration"] = int(h) * 60 + int(m)
-                    if not matches:
-                        scraped["duration"] = scrapertoolsV2.find_single_match(scraped["duration"], r'(\d+)')
-                    infolabels['duration'] = int(scraped["duration"]) * 60
-                if scraped["genere"]:
-                    genres = scrapertoolsV2.find_multiple_matches(scraped["genere"], '[A-Za-z]+')
-                    infolabels['genere'] = ", ".join(genres)
-                if scraped["rating"]:
-                    infolabels['rating'] = scrapertoolsV2.decodeHtmlentities(scraped["rating"])
-
-            if type_content_dict:
-                for name, variants in type_content_dict.items():
-                    if scraped['type'] in variants:
-                        item.contentType = name
-            if type_action_dict:
-                for name, variants in type_action_dict.items():
-                    if scraped['type'] in variants:
-                        action = name
-
-            if inspect.stack()[1][3] == 'episodios':  item.contentType = 'episode'
-
-            if scraped["title"] not in blacklist:
-                it = Item(
-                    channel=item.channel,
-                    action=action,
-                    contentType=item.contentType,
-                    title=longtitle,
-                    fulltitle=title,
-                    show=title,
-                    language = lang if lang != '' else '',
-                    quality=scraped["quality"],
-                    url=scraped["url"],
-                    infoLabels=infolabels,
-                    thumbnail=scraped["thumb"],
-                    args=item.args
-                )
-
-                for lg in list(set(listGroups).difference(known_keys)):
-                    it.__setattr__(lg, match[listGroups.index(lg)])
-
-                itemlist.append(it)
-        checkHost(item, itemlist)
-        if (item.contentType == "tvshow" and (action != "findvideos" and action != "play")) \
-                or (item.contentType == "episode" and action != "play") \
-                or (item.contentType == "movie" and action != "play"):
-            tmdb.set_infoLabels_itemlist(itemlist, seekTmdb=True)
+                for n, regex in enumerate(patronBlock):
+                    blocks = scrapertoolsV2.find_multiple_matches(block, regex)
+                    block = ""
+                    for b in blocks:
+                        block += "\n" + str(b)
+                    log('BLOCK ', n, '=', block)
        else:
-            for it in itemlist:
-                it.infoLabels = item.infoLabels
+            block = data
+        if patron:
+            matches = scrapertoolsV2.find_multiple_matches_groups(block, patron)
+            log('MATCHES =', matches)

-        if patronNext:
-            nextPage(itemlist, item, data, patronNext, 2)
+            known_keys = ['url', 'title', 'title2', 'episode', 'thumb', 'quality', 'year', 'plot', 'duration', 'genere',
+                          'rating', 'type', 'lang']  # by greko aggiunto episode
+            lang = '' # aggiunto per gestire i siti con pagine di serietv dove si hanno i video in ita e in subita
+            
+            for match in matches:
+                listGroups = match.keys()
+                match = match.values()

-        if addVideolibrary and (item.infoLabels["title"] or item.fulltitle):
-            item.fulltitle = item.infoLabels["title"]
-            videolibrary(itemlist, item)
+                if len(listGroups) > len(match):  # to fix a bug
+                    match = list(match)
+                    match.extend([''] * (len(listGroups) - len(match)))

-    return itemlist
+                scraped = {}
+                for kk in known_keys:
+                    val = match[listGroups.index(kk)] if kk in listGroups else ''
+                    if val and (kk == "url" or kk == 'thumb') and 'http' not in val:
+                        val = scrapertoolsV2.find_single_match(item.url, 'https?://[a-z0-9.-]+') + val
+                    scraped[kk] = val
+
+                title = scrapertoolsV2.htmlclean(scrapertoolsV2.decodeHtmlentities(scraped["title"])
+                                                 .replace('"',"'")).strip()  # fix by greko da " a '
+                plot = scrapertoolsV2.htmlclean(scrapertoolsV2.decodeHtmlentities(scraped["plot"]))
+
+                longtitle = typo(title, 'bold')
+                if scraped['quality']: longtitle = longtitle + typo(scraped['quality'], '_ [] color kod')
+                if scraped['episode']:
+                    scraped['episode'] = re.sub(r'\s-\s|-|x|&#8211', 'x', scraped['episode'])
+                    longtitle = typo(scraped['episode'] + ' - ', 'bold') + longtitle
+                if scraped['title2']:
+                    title2 = scrapertoolsV2.htmlclean(scrapertoolsV2.decodeHtmlentities(scraped["title2"]).replace('"', "'")).strip()
+                    longtitle = longtitle + typo(title2, 'bold _ -- _')
+                    
+                ##    Added/modified to handle sites that list both the ITA and
+                ##    Sub-ITA videos of a TV series on the same page
+                if scraped['lang']:              
+                    if 'sub' in scraped['lang'].lower():
+                        lang = 'Sub-ITA'
+                    else:
+                        lang = 'ITA'                      
+                if lang != '':
+                        longtitle += typo(lang, '_ [] color kod')
+
+                # if title is set, probably this is a list of episodes or video sources
+                if item.infoLabels["title"] or item.fulltitle:  
+                    infolabels = item.infoLabels
+                else:
+                    infolabels = {}
+                    if scraped["year"]:
+                        infolabels['year'] = scraped["year"]
+                    if scraped["plot"]:
+                        infolabels['plot'] = plot
+                    if scraped["duration"]:
+                        matches = scrapertoolsV2.find_multiple_matches(scraped["duration"],
+                                                                       r'([0-9])\s*?(?:[hH]|:|\.|,|\\|\/|\||\s)\s*?([0-9]+)')
+                        for h, m in matches:
+                            scraped["duration"] = int(h) * 60 + int(m)
+                        if not matches:
+                            scraped["duration"] = scrapertoolsV2.find_single_match(scraped["duration"], r'(\d+)')
+                        infolabels['duration'] = int(scraped["duration"]) * 60
+                    if scraped["genere"]:
+                        genres = scrapertoolsV2.find_multiple_matches(scraped["genere"], '[A-Za-z]+')
+                        infolabels['genere'] = ", ".join(genres)
+                    if scraped["rating"]:
+                        infolabels['rating'] = scrapertoolsV2.decodeHtmlentities(scraped["rating"])
+
+                if typeContentDict:
+                    for name, variants in typeContentDict.items():
+                        if scraped['type'] in variants:
+                            item.contentType = name
+                if typeActionDict:
+                    for name, variants in typeActionDict.items():
+                        if scraped['type'] in variants:
+                            action = name
+
+                if scraped["title"] not in blacklist:
+                    it = Item(
+                        channel=item.channel,
+                        action=action,
+                        contentType=item.contentType,
+                        title=longtitle,
+                        fulltitle=title,
+                        show=title,
+                        quality=scraped["quality"],
+                        url=scraped["url"],
+                        infoLabels=infolabels,
+                        thumbnail=scraped["thumb"],
+                        args=item.args
+                    )
+
+                    for lg in list(set(listGroups).difference(known_keys)):
+                        it.__setattr__(lg, match[listGroups.index(lg)])
+
+                    if 'itemHook' in args:
+                        it = args['itemHook'](it)
+                    itemlist.append(it)
+            checkHost(item, itemlist)
+##            if (item.contentType == "episode" and (action != "findvideos" and action != "play")) \
+##                    or (item.contentType == "movie" and action != "play"):
+            if (item.contentType == "tvshow" and (action != "findvideos" and action != "play")) \
+                or (item.contentType == "episode" and action != "play") \
+                or (item.contentType == "movie" and action != "play") :            
+                tmdb.set_infoLabels_itemlist(itemlist, seekTmdb=True)
+            else:
+                for it in itemlist:
+                    it.infoLabels = item.infoLabels
+                
+            if 'itemlistHook' in args:
+                itemlist = args['itemlistHook'](itemlist)
+
+            if patronNext:
+                nextPage(itemlist, item, data, patronNext, 2)
+
+            if anime:
+                from specials import autorenumber
+                autorenumber.renumber(itemlist)
+                
+            if addVideolibrary and (item.infoLabels["title"] or item.fulltitle):
+                item.fulltitle = item.infoLabels["title"]
+                videolibrary(itemlist, item)
+                
+            if 'fullItemlistHook' in args:
+                itemlist = args['fullItemlistHook'](itemlist)
+
+        return itemlist
+
+    return wrapper


 def checkHost(item, itemlist):
@@ -398,13 +428,9 @@ def swzz_get_url(item):
    return data


-def menu(itemlist, title='', action='', url='', contentType='movie', args=[]):
+def menuItem(itemlist, filename, title='', action='', url='', contentType='movie', args=[]):
    # Function to simplify menu creation

-    frame = inspect.stack()[1]
-    filename = frame[0].f_code.co_filename
-    filename = os.path.basename(filename).replace('.py','')
-
    # Call typo function
    title = typo(title)

@@ -428,6 +454,51 @@ def menu(itemlist, title='', action='', url='', contentType='movie', args=[]):
    return itemlist


+def menu(func):
+    """Decorator that builds a channel's main menu from the dict func returns.
+
+    Keys read from that dict: 'item' (required); 'film' and 'tvshow', url
+    suffixes appended to the channel module's global host (an absent or None
+    value skips the section); 'filmSub' and 'tvshowSub', iterables of
+    (title, var) pairs where var is [url suffix, action] plus optional args.
+    host, list_servers and list_quality are read from func's module globals.
+    The wrapper returns the itemlist handed to menuItem and autoplay.
+    """
+    def wrapper(*args):
+        args = func(*args)
+
+        item = args['item']
+        host = func.__globals__['host']
+        list_servers = func.__globals__['list_servers']
+        list_quality = func.__globals__['list_quality']
+        filename = func.__module__.split('.')[1]
+        autoplay.init(item.channel, list_servers, list_quality)
+
+        itemlist = []
+        # (dict key, main entry title, content type, args of the search entry)
+        sections = (('film', 'Film bold', 'movie', 'film'),
+                    ('tvshow', 'Serie TV bold', 'tvshow', 'serie'))
+        for key, title, contentType, searchArgs in sections:
+            if args.get(key) is None:
+                continue
+            menuItem(itemlist, filename, title, 'peliculas', host + args[key],
+                     contentType=contentType)
+            # '<key>Sub' may be absent or None: treat both as "no submenu"
+            # instead of raising TypeError by iterating over None.
+            for sub, var in args.get(key + 'Sub') or ():
+                menuItem(itemlist, filename, sub + ' submenu', var[1],
+                         host + var[0], contentType=contentType,
+                         args=var[2] if len(var) > 2 else '')
+            # the search entry keeps menuItem's default contentType
+            menuItem(itemlist, filename, 'Cerca submenu bold', 'search', host,
+                     args=searchArgs)
+
+        autoplay.show_option(item.channel, itemlist)
+        return itemlist
+
+    return wrapper
+
+
 def typo(string, typography=''):

    kod_color = '0xFF65B3DA' #'0xFF0081C2'
@@ -480,7 +551,7 @@ def typo(string, typography=''):
    return string


-def match(item, patron='', patron_block='', headers='', url=''):
+def match(item, patron='', patronBlock='', headers='', url=''):
    matches = []
    url = url if url else item.url
    data = httptools.downloadpage(url, headers=headers, ignore_response_code=True).data.replace("'", '"')
@@ -488,8 +559,8 @@ def match(item, patron='', patron_block='', headers='', url=''):
    data = re.sub(r'>\s\s*<', '><', data)
    log('DATA= ', data)

-    if patron_block:
-        block = scrapertoolsV2.find_single_match(data, patron_block)
+    if patronBlock:
+        block = scrapertoolsV2.find_single_match(data, patronBlock)
        log('BLOCK= ',block)
    else:
        block = data
@@ -546,7 +617,8 @@ def nextPage(itemlist, item, data='', patron='', function_level=1, next_page='',
        log('NEXT= ', next_page)
        itemlist.append(
            Item(channel=item.channel,
-                 action=inspect.stack()[function_level][3],
+                 #action=inspect.stack()[function_level][3],
+                 action = item.action,
                 contentType=item.contentType,
                 title=typo(config.get_localized_string(30992), 'color kod bold'),
                 url=next_page,
@@ -556,7 +628,7 @@ def nextPage(itemlist, item, data='', patron='', function_level=1, next_page='',
    return itemlist

 def pagination(itemlist, item, page, perpage, function_level=1):
-    if len(itemlist) >= perpage: # page * perpage
+    if len(itemlist) >= page * perpage:
        itemlist.append(
            Item(channel=item.channel,
                 action=inspect.stack()[function_level][3],