[youtube:tab] Fix feeds extraction (closes ytdl-org#25695, closes ytd…

…l-org#26452)
ThirumalaiK · Jan 28, 2021 · cfab868 · cfab868
1 parent bfe4af3
commit cfab868
Show file tree

Hide file tree

Showing 2 changed files with 97 additions and 71 deletions.
diff --git a/test/test_all_urls.py b/test/test_all_urls.py
@@ -61,9 +61,10 @@ def test_youtube_channel_matching(self):
     #     self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:tab'])
 
     def test_youtube_feeds(self):
-        self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watchlater'])
-        self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:subscriptions'])
-        self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended'])
+        self.assertMatch('https://www.youtube.com/feed/library', ['youtube:tab'])
+        self.assertMatch('https://www.youtube.com/feed/history', ['youtube:tab'])
+        self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:tab'])
+        self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:tab'])
 
     # def test_youtube_search_matching(self):
     #     self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])

diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
@@ -33,7 +33,6 @@
     get_element_by_id,
     int_or_none,
     mimetype2ext,
-    orderedSet,
     parse_codecs,
     parse_duration,
     remove_quotes,
@@ -2381,7 +2380,19 @@ def decrypt_sig(mobj):
 
 class YoutubeTabIE(YoutubeBaseInfoExtractor):
     IE_DESC = 'YouTube.com tab'
-    _VALID_URL = r'https?://(?:\w+\.)?(?:youtube(?:kids)?\.com|invidio\.us)/(?:(?:channel|c|user)/|(?:playlist|watch)\?.*?\blist=)(?P<id>[^/?#&]+)'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:\w+\.)?
+                        (?:
+                            youtube(?:kids)?\.com|
+                            invidio\.us
+                        )/
+                        (?:
+                            (?:channel|c|user|feed)/|
+                            (?:playlist|watch)\?.*?\blist=
+                        )
+                        (?P<id>[^/?\#&]+)
+                    '''
     IE_NAME = 'youtube:tab'
 
     _TESTS = [{
@@ -2620,7 +2631,30 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
     }, {
         'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
         'only_matching': True,
-    },
+    }, {
+        'url': 'https://www.youtube.com/feed/trending',
+        'only_matching': True,
+    }, {
+        # needs auth
+        'url': 'https://www.youtube.com/feed/library',
+        'only_matching': True,
+    }, {
+        # needs auth
+        'url': 'https://www.youtube.com/feed/history',
+        'only_matching': True,
+    }, {
+        # needs auth
+        'url': 'https://www.youtube.com/feed/subscriptions',
+        'only_matching': True,
+    }, {
+        # needs auth
+        'url': 'https://www.youtube.com/feed/watch_later',
+        'only_matching': True,
+    }, {
+        # no longer available?
+        'url': 'https://www.youtube.com/feed/recommended',
+        'only_matching': True,
+    }
         # TODO
         # {
         #     'url': 'https://www.youtube.com/TheYoungTurks/live',
@@ -2707,27 +2741,34 @@ def _grid_entries(self, grid_renderer):
                     'https://www.youtube.com/channel/%s' % channel_id,
                     ie=YoutubeTabIE.ie_key(), video_title=title)
 
-    def _shelf_entries_trimmed(self, shelf_renderer):
-        renderer = try_get(
-            shelf_renderer, lambda x: x['content']['horizontalListRenderer'], dict)
-        if not renderer:
+    def _shelf_entries_from_content(self, shelf_renderer):
+        content = shelf_renderer.get('content')
+        if not isinstance(content, dict):
             return
-        # TODO: add support for nested playlists so each shelf is processed
-        # as separate playlist
-        # TODO: this includes only first N items
-        for entry in self._grid_entries(renderer):
-            yield entry
+        renderer = content.get('gridRenderer')
+        if renderer:
+            # TODO: add support for nested playlists so each shelf is processed
+            # as separate playlist
+            # TODO: this includes only first N items
+            for entry in self._grid_entries(renderer):
+                yield entry
+        renderer = content.get('horizontalListRenderer')
+        if renderer:
+            # TODO
+            pass
 
     def _shelf_entries(self, shelf_renderer):
         ep = try_get(
             shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
             compat_str)
         shelf_url = urljoin('https://www.youtube.com', ep)
-        if not shelf_url:
-            return
-        title = try_get(
-            shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
-        yield self.url_result(shelf_url, video_title=title)
+        if shelf_url:
+            title = try_get(
+                shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
+            yield self.url_result(shelf_url, video_title=title)
+        # Shelf may not contain shelf URL, fallback to extraction from content
+        for entry in self._shelf_entries_from_content(shelf_renderer):
+            yield entry
 
     def _playlist_entries(self, video_list_renderer):
         for content in video_list_renderer['contents']:
@@ -2832,8 +2873,11 @@ def _extract_continuation(cls, renderer):
             }
 
     def _entries(self, tab, identity_token):
+        slr_renderer = try_get(tab, lambda x: x['sectionListRenderer'], dict)
+        if not slr_renderer:
+            return
         continuation = None
-        slr_contents = try_get(tab, lambda x: x['sectionListRenderer']['contents'], list) or []
+        slr_contents = try_get(slr_renderer, lambda x: x['contents'], list) or []
         for slr_content in slr_contents:
             if not isinstance(slr_content, dict):
                 continue
@@ -2876,6 +2920,9 @@ def _entries(self, tab, identity_token):
             if not continuation:
                 continuation = self._extract_continuation(is_renderer)
 
+        if not continuation:
+            continuation = self._extract_continuation(slr_renderer)
+
         headers = {
             'x-youtube-client-name': '1',
             'x-youtube-client-version': '2.20201112.04.01',
@@ -2924,7 +2971,7 @@ def _entries(self, tab, identity_token):
                 continuation_item = continuation_items[0]
                 if not isinstance(continuation_item, dict):
                     continue
-                renderer = continuation_item.get('playlistVideoRenderer')
+                renderer = continuation_item.get('playlistVideoRenderer') or continuation_item.get('itemSectionRenderer')
                 if renderer:
                     video_list_renderer = {'contents': continuation_items}
                     for entry in self._playlist_entries(video_list_renderer):
@@ -2969,6 +3016,7 @@ def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
         selected_tab = self._extract_selected_tab(tabs)
         renderer = try_get(
             data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
+        playlist_id = title = description = None
         if renderer:
             channel_title = renderer.get('title') or item_id
             tab_title = selected_tab.get('title')
@@ -3289,10 +3337,10 @@ def _real_extract(self, url):
 """
 
 
-class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
+class YoutubeFeedsInfoExtractor(YoutubeTabIE):
     """
     Base class for feed extractors
-    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
+    Subclasses must define the _FEED_NAME property.
     """
     _LOGIN_REQUIRED = True
 
@@ -3303,55 +3351,17 @@ def IE_NAME(self):
     def _real_initialize(self):
         self._login()
 
-    def _entries(self, page):
-        # The extraction process is the same as for playlists, but the regex
-        # for the video ids doesn't contain an index
-        ids = []
-        more_widget_html = content_html = page
-        for page_num in itertools.count(1):
-            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
-
-            # 'recommended' feed has infinite 'load more' and each new portion spins
-            # the same videos in (sometimes) slightly different order, so we'll check
-            # for unicity and break when portion has no new videos
-            new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches)))
-            if not new_ids:
-                break
-
-            ids.extend(new_ids)
-
-            for entry in self._ids_to_results(new_ids):
-                yield entry
-
-            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
-            if not mobj:
-                break
-
-            more = self._download_json(
-                'https://www.youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
-                'Downloading page #%s' % page_num,
-                transform_source=uppercase_escape,
-                headers=self._YOUTUBE_CLIENT_HEADERS)
-            content_html = more['content_html']
-            more_widget_html = more['load_more_widget_html']
-
     def _real_extract(self, url):
-        page = self._download_webpage(
+        return self.url_result(
             'https://www.youtube.com/feed/%s' % self._FEED_NAME,
-            self._PLAYLIST_TITLE)
-        return self.playlist_result(
-            self._entries(page), playlist_title=self._PLAYLIST_TITLE)
+            ie=YoutubeTabIE.ie_key())
 
 
 class YoutubeWatchLaterIE(InfoExtractor):
     IE_NAME = 'youtube:watchlater'
     IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
-    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/watch_later|:ytwatchlater'
-
+    _VALID_URL = r':ytwatchlater'
     _TESTS = [{
-        'url': 'https://www.youtube.com/feed/watch_later',
-        'only_matching': True,
-    }, {
         'url': ':ytwatchlater',
         'only_matching': True,
     }]
@@ -3363,23 +3373,38 @@ def _real_extract(self, url):
 
 class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
     IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
-    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
+    _VALID_URL = r':ytrec(?:ommended)?'
     _FEED_NAME = 'recommended'
-    _PLAYLIST_TITLE = 'Youtube Recommended videos'
+    _TESTS = [{
+        'url': ':ytrec',
+        'only_matching': True,
+    }, {
+        'url': ':ytrecommended',
+        'only_matching': True,
+    }]
 
 
 class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
     IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
-    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
+    _VALID_URL = r':ytsubs(?:criptions)?'
     _FEED_NAME = 'subscriptions'
-    _PLAYLIST_TITLE = 'Youtube Subscriptions'
+    _TESTS = [{
+        'url': ':ytsubs',
+        'only_matching': True,
+    }, {
+        'url': ':ytsubscriptions',
+        'only_matching': True,
+    }]
 
 
 class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
     IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
-    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
+    _VALID_URL = r':ythistory'
     _FEED_NAME = 'history'
-    _PLAYLIST_TITLE = 'Youtube History'
+    _TESTS = [{
+        'url': ':ythistory',
+        'only_matching': True,
+    }]
 
 
 class YoutubeTruncatedURLIE(InfoExtractor):