Fork Sync #34

Merged: 1 commit, Nov 23, 2020

test/test_all_urls.py: 7 changes (4 additions, 3 deletions)

@@ -61,9 +61,10 @@ def test_youtube_channel_matching(self):
         # self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:tab'])

     def test_youtube_feeds(self):
-        self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watchlater'])
-        self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:subscriptions'])
-        self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended'])
+        self.assertMatch('https://www.youtube.com/feed/library', ['youtube:tab'])
+        self.assertMatch('https://www.youtube.com/feed/history', ['youtube:tab'])
+        self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:tab'])
+        self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:tab'])

     # def test_youtube_search_matching(self):
     #     self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])

youtube_dl/extractor/youtube.py: 161 changes (93 additions, 68 deletions)

@@ -33,7 +33,6 @@
     get_element_by_id,
     int_or_none,
     mimetype2ext,
-    orderedSet,
     parse_codecs,
     parse_duration,
     remove_quotes,
@@ -2381,7 +2380,19 @@ def decrypt_sig(mobj):

 class YoutubeTabIE(YoutubeBaseInfoExtractor):
     IE_DESC = 'YouTube.com tab'
-    _VALID_URL = r'https?://(?:\w+\.)?(?:youtube(?:kids)?\.com|invidio\.us)/(?:(?:channel|c|user)/|(?:playlist|watch)\?.*?\blist=)(?P<id>[^/?#&]+)'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:\w+\.)?
+                        (?:
+                            youtube(?:kids)?\.com|
+                            invidio\.us
+                        )/
+                        (?:
+                            (?:channel|c|user|feed)/|
+                            (?:playlist|watch)\?.*?\blist=
+                        )
+                        (?P<id>[^/?\#&]+)
+                    '''
     IE_NAME = 'youtube:tab'

     _TESTS = [{
@@ -2620,7 +2631,30 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
     }, {
         'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
         'only_matching': True,
-    },
+    }, {
+        'url': 'https://www.youtube.com/feed/trending',
+        'only_matching': True,
+    }, {
+        # needs auth
+        'url': 'https://www.youtube.com/feed/library',
+        'only_matching': True,
+    }, {
+        # needs auth
+        'url': 'https://www.youtube.com/feed/history',
+        'only_matching': True,
+    }, {
+        # needs auth
+        'url': 'https://www.youtube.com/feed/subscriptions',
+        'only_matching': True,
+    }, {
+        # needs auth
+        'url': 'https://www.youtube.com/feed/watch_later',
+        'only_matching': True,
+    }, {
+        # no longer available?
+        'url': 'https://www.youtube.com/feed/recommended',
+        'only_matching': True,
+    }
     # TODO
     # {
     #     'url': 'https://www.youtube.com/TheYoungTurks/live',
@@ -2707,27 +2741,34 @@ def _grid_entries(self, grid_renderer):
                     'https://www.youtube.com/channel/%s' % channel_id,
                     ie=YoutubeTabIE.ie_key(), video_title=title)

-    def _shelf_entries_trimmed(self, shelf_renderer):
-        renderer = try_get(
-            shelf_renderer, lambda x: x['content']['horizontalListRenderer'], dict)
-        if not renderer:
+    def _shelf_entries_from_content(self, shelf_renderer):
+        content = shelf_renderer.get('content')
+        if not isinstance(content, dict):
             return
-        # TODO: add support for nested playlists so each shelf is processed
-        # as separate playlist
-        # TODO: this includes only first N items
-        for entry in self._grid_entries(renderer):
-            yield entry
+        renderer = content.get('gridRenderer')
+        if renderer:
+            # TODO: add support for nested playlists so each shelf is processed
+            # as separate playlist
+            # TODO: this includes only first N items
+            for entry in self._grid_entries(renderer):
+                yield entry
+        renderer = content.get('horizontalListRenderer')
+        if renderer:
+            # TODO
+            pass

     def _shelf_entries(self, shelf_renderer):
         ep = try_get(
             shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
             compat_str)
         shelf_url = urljoin('https://www.youtube.com', ep)
-        if not shelf_url:
-            return
-        title = try_get(
-            shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
-        yield self.url_result(shelf_url, video_title=title)
+        if shelf_url:
+            title = try_get(
+                shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
+            yield self.url_result(shelf_url, video_title=title)
+        # Shelf may not contain shelf URL, fallback to extraction from content
+        for entry in self._shelf_entries_from_content(shelf_renderer):
+            yield entry

     def _playlist_entries(self, video_list_renderer):
         for content in video_list_renderer['contents']:
@@ -2832,8 +2873,11 @@ def _extract_continuation(cls, renderer):
            }

     def _entries(self, tab, identity_token):
+        slr_renderer = try_get(tab, lambda x: x['sectionListRenderer'], dict)
+        if not slr_renderer:
+            return
         continuation = None
-        slr_contents = try_get(tab, lambda x: x['sectionListRenderer']['contents'], list) or []
+        slr_contents = try_get(slr_renderer, lambda x: x['contents'], list) or []
         for slr_content in slr_contents:
             if not isinstance(slr_content, dict):
                 continue
@@ -2876,6 +2920,9 @@ def _entries(self, tab, identity_token):
             if not continuation:
                 continuation = self._extract_continuation(is_renderer)

+        if not continuation:
+            continuation = self._extract_continuation(slr_renderer)
+
         headers = {
             'x-youtube-client-name': '1',
             'x-youtube-client-version': '2.20201112.04.01',
@@ -2924,7 +2971,7 @@ def _entries(self, tab, identity_token):
                 continuation_item = continuation_items[0]
                 if not isinstance(continuation_item, dict):
                     continue
-                renderer = continuation_item.get('playlistVideoRenderer')
+                renderer = continuation_item.get('playlistVideoRenderer') or continuation_item.get('itemSectionRenderer')
                 if renderer:
                     video_list_renderer = {'contents': continuation_items}
                     for entry in self._playlist_entries(video_list_renderer):
@@ -2969,6 +3016,7 @@ def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
         selected_tab = self._extract_selected_tab(tabs)
         renderer = try_get(
             data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
+        playlist_id = title = description = None
         if renderer:
             channel_title = renderer.get('title') or item_id
             tab_title = selected_tab.get('title')
@@ -3289,10 +3337,10 @@ def _real_extract(self, url):
     """


-class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
+class YoutubeFeedsInfoExtractor(YoutubeTabIE):
     """
     Base class for feed extractors
-    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
+    Subclasses must define the _FEED_NAME property.
     """
     _LOGIN_REQUIRED = True

@@ -3303,55 +3351,17 @@ def IE_NAME(self):
     def _real_initialize(self):
         self._login()

-    def _entries(self, page):
-        # The extraction process is the same as for playlists, but the regex
-        # for the video ids doesn't contain an index
-        ids = []
-        more_widget_html = content_html = page
-        for page_num in itertools.count(1):
-            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
-
-            # 'recommended' feed has infinite 'load more' and each new portion spins
-            # the same videos in (sometimes) slightly different order, so we'll check
-            # for unicity and break when portion has no new videos
-            new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches)))
-            if not new_ids:
-                break
-
-            ids.extend(new_ids)
-
-            for entry in self._ids_to_results(new_ids):
-                yield entry
-
-            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
-            if not mobj:
-                break
-
-            more = self._download_json(
-                'https://www.youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
-                'Downloading page #%s' % page_num,
-                transform_source=uppercase_escape,
-                headers=self._YOUTUBE_CLIENT_HEADERS)
-            content_html = more['content_html']
-            more_widget_html = more['load_more_widget_html']
-
     def _real_extract(self, url):
-        page = self._download_webpage(
+        return self.url_result(
             'https://www.youtube.com/feed/%s' % self._FEED_NAME,
-            self._PLAYLIST_TITLE)
-        return self.playlist_result(
-            self._entries(page), playlist_title=self._PLAYLIST_TITLE)
+            ie=YoutubeTabIE.ie_key())


 class YoutubeWatchLaterIE(InfoExtractor):
     IE_NAME = 'youtube:watchlater'
     IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
-    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/watch_later|:ytwatchlater'
-
+    _VALID_URL = r':ytwatchlater'
     _TESTS = [{
-        'url': 'https://www.youtube.com/feed/watch_later',
-        'only_matching': True,
-    }, {
         'url': ':ytwatchlater',
         'only_matching': True,
     }]
@@ -3363,23 +3373,38 @@ def _real_extract(self, url):


 class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
     IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
-    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
+    _VALID_URL = r':ytrec(?:ommended)?'
     _FEED_NAME = 'recommended'
-    _PLAYLIST_TITLE = 'Youtube Recommended videos'
+    _TESTS = [{
+        'url': ':ytrec',
+        'only_matching': True,
+    }, {
+        'url': ':ytrecommended',
+        'only_matching': True,
+    }]


 class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
     IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
-    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
+    _VALID_URL = r':ytsubs(?:criptions)?'
     _FEED_NAME = 'subscriptions'
-    _PLAYLIST_TITLE = 'Youtube Subscriptions'
+    _TESTS = [{
+        'url': ':ytsubs',
+        'only_matching': True,
+    }, {
+        'url': ':ytsubscriptions',
+        'only_matching': True,
+    }]


 class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
     IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
-    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
+    _VALID_URL = r':ythistory'
     _FEED_NAME = 'history'
-    _PLAYLIST_TITLE = 'Youtube History'
+    _TESTS = [{
+        'url': ':ythistory',
+        'only_matching': True,
+    }]


 class YoutubeTruncatedURLIE(InfoExtractor):