Skip to content

Commit

Permalink
fix/videos (#3)
Browse files Browse the repository at this point in the history
* fix/videos

port pytube/pytube#1422

* rm debug print leftover
  • Loading branch information
JarbasAl authored Sep 4, 2024
1 parent 0828aad commit fcae460
Show file tree
Hide file tree
Showing 3 changed files with 212 additions and 32 deletions.
4 changes: 4 additions & 0 deletions tutubo/pytube/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,10 @@ def length(self) -> int:
"""
return int(self.vid_info.get('videoDetails', {}).get('lengthSeconds'))

@property
def is_live(self) -> bool:
    """Report whether this video's reported length is zero.

    NOTE(review): presumably a zero ``lengthSeconds`` marks an ongoing
    live stream -- confirm against real responses.

    :rtype: bool
    :returns: ``True`` when ``self.length`` equals 0, ``False`` otherwise.
    """
    duration_seconds = self.length
    return duration_seconds == 0

@property
def views(self) -> int:
"""Get the number of the times the video has been viewed.
Expand Down
236 changes: 205 additions & 31 deletions tutubo/pytube/contrib/channel.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
import logging
from typing import Dict, List, Tuple, Optional, Iterable

from tutubo.pytube import extract, Playlist, request
import requests

from tutubo.pytube import extract, YouTube, Playlist, request
from tutubo.pytube.helpers import uniqueify, cache, DeferredGeneratorList

logger = logging.getLogger(__name__)
Expand All @@ -28,22 +30,22 @@ def __init__(self, url: str, proxies: Optional[Dict[str, str]] = None):
)

self.videos_url = self.channel_url + '/videos'
self.shorts_url = self.channel_url + '/shorts'
self.live_url = self.channel_url + '/streams'
self.playlists_url = self.channel_url + '/playlists'
self.community_url = self.channel_url + '/community'
self.featured_channels_url = self.channel_url + '/channels'
self.about_url = self.channel_url + '/about'
self.shorts_url = self.channel_url + '/shorts'
self.live_url = self.channel_url + '/streams'

self._html_url = self.videos_url # Videos will be preferred over short videos and live
self._visitor_data = None

# Possible future additions
self._playlists_html = None
self._community_html = None
self._featured_channels_html = None
self._about_html = None

self._html_url = self.videos_url # Videos will be preferred over short videos and live
self._visitor_data = None

@property
def channel_name(self):
"""Get the name of the YouTube channel.
Expand Down Expand Up @@ -72,15 +74,34 @@ def vanity_url(self):
"""
return self.initial_data['metadata']['channelMetadataRenderer'].get('vanityChannelUrl', None) # noqa:E501

@property
def html_url(self):
    """Return the currently selected channel page url.

    :rtype: str
    """
    current_url = self._html_url
    return current_url

@html_url.setter
def html_url(self, value):
    """Select another channel page url and drop the data cached for the old one.

    Assigning the url that is already selected leaves every cache untouched.
    """
    if self._html_url != value:
        # The stored html and parsed initial data belong to the previous url.
        self._html = None
        self._initial_data = None
        # The caches of these properties are reached through the getter of
        # the class-level property object.
        owner = self.__class__
        for cached_name in ("video_urls", "last_updated", "title"):
            getattr(owner, cached_name).fget.cache_clear()
        self._html_url = value

@property
def html(self):
"""Get the html for the /videos page.
"""Get the html for the /videos or /shorts page.
:rtype: str
"""
if self._html:
return self._html
self._html = request.get(self.videos_url)
self._html = requests.get(self.html_url).text
return self._html

@property
Expand Down Expand Up @@ -139,8 +160,41 @@ def about_html(self):
self._about_html = request.get(self.about_url)
return self._about_html

@staticmethod
def _extract_videos(raw_json: str) -> Tuple[List[str], Optional[str]]:
def _build_continuation_url(self, continuation: str) -> Tuple[str, dict, dict]:
"""Helper method to build the url and headers required to request
the next page of videos
:param str continuation: Continuation extracted from the json response
of the last page
:rtype: Tuple[str, dict, dict]
:returns: Tuple of an url and required headers for the next http
request
"""
return (
(
# was changed to this format (and post requests)
# between 2022.11.06 and 2022.11.20
"https://www.youtube.com/youtubei/v1/browse?key="
f"{self.yt_api_key}"
),
{
"X-YouTube-Client-Name": "1",
"X-YouTube-Client-Version": "2.20200720.00.02",
},
# extra data required for post request
{
"continuation": continuation,
"context": {
"client": {
"clientName": "WEB",
"visitorData": self._visitor_data,
"clientVersion": "2.20200720.00.02"
}
}
}
)

def _extract_videos(self, raw_json: str) -> Tuple[List[str], Optional[str]]:
"""Extracts videos from a raw json page
:param str raw_json: Input json extracted from the page or the last
Expand All @@ -153,12 +207,22 @@ def _extract_videos(raw_json: str) -> Tuple[List[str], Optional[str]]:
# this is the json tree structure, if the json was extracted from
# html
try:
videos = initial_data["contents"][
"twoColumnBrowseResultsRenderer"][
"tabs"][1]["tabRenderer"]["content"][
"sectionListRenderer"]["contents"][0][
"itemSectionRenderer"]["contents"][0][
"gridRenderer"]["items"]
# Possible tabs: Home, Videos, Shorts, Live, Playlists, Community, Channels, About
active_tab = {}
for tab in initial_data["contents"]["twoColumnBrowseResultsRenderer"]["tabs"]:
tab_url = tab["tabRenderer"]["endpoint"]["commandMetadata"]["webCommandMetadata"]["url"]
if tab_url.rsplit('/', maxsplit=1)[-1] == self.html_url.rsplit('/', maxsplit=1)[-1]:
active_tab = tab
break

# This is the json tree structure for videos, shorts and streams
videos = active_tab["tabRenderer"]["content"]["richGridRenderer"]["contents"]

# This is the json tree structure of visitor data
# It is necessary to send the visitorData together with the continuation token
self._visitor_data = initial_data["responseContext"]["webResponseContextExtensionData"][
"ytConfigData"]["visitorData"]

except (KeyError, IndexError, TypeError):
try:
# this is the json tree structure, if the json was directly sent
Expand Down Expand Up @@ -188,22 +252,102 @@ def _extract_videos(raw_json: str) -> Tuple[List[str], Optional[str]]:
# if there is an error, no continuation is available
continuation = None

# only extract the video ids from the video data
videos_url = []
try:
# Extract id from videos and live
for x in videos:
videos_url.append(f"/watch?v="
f"{x['richItemRenderer']['content']['videoRenderer']['videoId']}")
except (KeyError, IndexError, TypeError):
# Extract id from short videos
for x in videos:
videos_url.append(f"/watch?v="
f"{x['richItemRenderer']['content']['reelItemRenderer']['videoId']}")

# remove duplicates
return (
uniqueify(
list(
# only extract the video ids from the video data
map(
lambda x: (
f"/watch?v="
f"{x['gridVideoRenderer']['videoId']}"
),
videos
)
),
),
continuation,
)
return uniqueify(videos_url), continuation

@property
def views(self) -> int:
    """Extract view count for channel.

    Selects the about page, then reads the view count text (for example
    ``"1,234,567 views"``) from the second to last tab of the initial data.

    :return: Channel view count, or 0 when the count is missing from the
        page data or is not a number.
    :rtype: int
    """
    self.html_url = self.about_url
    try:
        tabs = self.initial_data['contents']['twoColumnBrowseResultsRenderer']['tabs']
        # NOTE(review): the "about" tab is taken to be the second to last
        # tab -- confirm this still matches the page layout.
        about_tab = tabs[-2]
        about_metadata = about_tab['tabRenderer']['content']['sectionListRenderer'][
            'contents'][0]['itemSectionRenderer']['contents'][0][
            'channelAboutFullMetadataRenderer']
        views_text = about_metadata['viewCountText']['simpleText']

        # "1,234,567 views" -> "1,234,567" -> "1234567"
        count_text = views_text.split(' ')[0].replace(',', '')
        return int(count_text)
    except (KeyError, IndexError, TypeError, ValueError):
        # Any missing key, short list, unexpected node type or non-numeric
        # text means the count is unavailable.
        return 0

@property
@cache
def title(self) -> str:
    """Extract the channel title after selecting the channel url.

    :return: Channel title (name)
    :rtype: str
    """
    self.html_url = self.channel_url
    channel_metadata = self.initial_data['metadata']['channelMetadataRenderer']
    return channel_metadata['title']

@property
def description(self) -> str:
    """Extract the channel description after selecting the channel url.

    :return: Channel description
    :rtype: str
    """
    self.html_url = self.channel_url
    channel_metadata = self.initial_data['metadata']['channelMetadataRenderer']
    return channel_metadata['description']

@property
def length(self):
    """Extract the approximate amount of videos from the channel.

    The value is the text of the first run of the header's videos count,
    returned as found (it is not converted to a number).

    :return: Channel videos count
    :rtype: str
    """
    self.html_url = self.channel_url
    header = self.initial_data['header']['c4TabbedHeaderRenderer']
    first_run = header['videosCountText']['runs'][0]
    return first_run['text']

@property
@cache
def last_updated(self) -> Optional[str]:
    """Extract the date of the last uploaded video.

    Selects the videos page, then reads the published time text of the
    first item in the grid of the second tab.

    :return: Published time text of the first listed video, or ``None``
        when the page data has no such entry (for example an empty grid).
    :rtype: Optional[str]
    """
    self.html_url = self.videos_url
    try:
        tabs = self.initial_data['contents']['twoColumnBrowseResultsRenderer']['tabs']
        # NOTE(review): the videos tab is taken to be at index 1 -- confirm.
        grid_items = tabs[1]['tabRenderer']['content']['richGridRenderer']['contents']
        first_video = grid_items[0]['richItemRenderer']['content']['videoRenderer']
        return first_video['publishedTimeText']['simpleText']
    except (KeyError, IndexError, TypeError):
        # A missing key, a short tab list, an empty grid or an unexpected
        # node type all mean there is no date to report.
        return None

@property
def thumbnail_url(self) -> str:
    """Extract the profile image url from the channel metadata.

    The channel url is selected first, and the first entry of the avatar
    thumbnails is used.

    :rtype: str
    :return: a string with the url of the channel's profile image
    """
    self.html_url = self.channel_url  # select the channel home page
    avatar = self.initial_data['metadata']['channelMetadataRenderer']['avatar']
    first_thumbnail = avatar['thumbnails'][0]
    return first_thumbnail['url']

@property
def playlists(self) -> Iterable[Playlist]:
Expand Down Expand Up @@ -307,6 +451,36 @@ def playlist_urls(self) -> DeferredGeneratorList:
"""
return DeferredGeneratorList(self.playlist_url_generator())

@property
def videos(self) -> Iterable[YouTube]:
    """Return the videos of this channel as YouTube objects.

    The videos tab is selected before the generator is created.
    NOTE(review): the tab selection is stored on the instance; if the
    generator reads it lazily, selecting another tab before iterating may
    change what is listed -- confirm against ``videos_generator``.

    :rtype: List[YouTube]
    :returns: List of YouTube
    """
    self.html_url = self.videos_url  # select the videos tab
    video_generator = self.videos_generator()
    return DeferredGeneratorList(video_generator)

@property
def shorts(self) -> Iterable[YouTube]:
    """Return the short videos of this channel as YouTube objects.

    The shorts tab is selected before the generator is created.
    NOTE(review): the tab selection is stored on the instance; if the
    generator reads it lazily, selecting another tab before iterating may
    change what is listed -- confirm against ``videos_generator``.

    :rtype: List[YouTube]
    :returns: List of YouTube
    """
    self.html_url = self.shorts_url  # select the shorts tab
    shorts_generator = self.videos_generator()
    return DeferredGeneratorList(shorts_generator)

@property
def live(self) -> Iterable[YouTube]:
    """Return the live streams of this channel as YouTube objects.

    The streams tab is selected before the generator is created.
    NOTE(review): the tab selection is stored on the instance; if the
    generator reads it lazily, selecting another tab before iterating may
    change what is listed -- confirm against ``videos_generator``.

    :rtype: List[YouTube]
    :returns: List of YouTube
    """
    self.html_url = self.live_url  # select the streams tab
    live_generator = self.videos_generator()
    return DeferredGeneratorList(live_generator)

@property
def as_dict(self):
return {'channelId': self.channel_id,
Expand Down
4 changes: 3 additions & 1 deletion tutubo/pytube/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,8 +177,10 @@ def channel_name(url: str) -> str:
if function_match:
logger.debug("finished regex search, matched: %s", pattern)
uri_style = function_match.group(1)
uri_style = uri_style if uri_style else "c"
uri_identifier = function_match.group(2)
if "@" in url:
return f"/@{uri_identifier}"
uri_style = uri_style if uri_style else "c"
return f'/{uri_style}/{uri_identifier}'

raise RegexMatchError(
Expand Down

0 comments on commit fcae460

Please sign in to comment.