fix/videos (#3)

* fix/videos: port pytube/pytube#1422
* remove leftover debug print
OpenJarbas · Sep 4, 2024 · fcae460 · fcae460
1 parent 0828aad
commit fcae460
Show file tree

Hide file tree

Showing 3 changed files with 212 additions and 32 deletions.
diff --git a/tutubo/pytube/__main__.py b/tutubo/pytube/__main__.py
@@ -380,6 +380,10 @@ def length(self) -> int:
         """
         return int(self.vid_info.get('videoDetails', {}).get('lengthSeconds'))
 
+    @property
+    def is_live(self) -> bool:
+        """Report whether the video's ``length`` is zero.
+
+        A zero length is what this property treats as "live".
+        NOTE(review): ``length`` converts ``lengthSeconds`` with ``int()``,
+        so a missing value raises there instead of returning False here.
+
+        :rtype: bool
+        """
+        return self.length == 0
+
     @property
     def views(self) -> int:
         """Get the number of the times the video has been viewed.

diff --git a/tutubo/pytube/contrib/channel.py b/tutubo/pytube/contrib/channel.py
@@ -4,7 +4,9 @@
 import logging
 from typing import Dict, List, Tuple, Optional, Iterable
 
-from tutubo.pytube import extract, Playlist, request
+import requests
+
+from tutubo.pytube import extract, YouTube, Playlist, request
 from tutubo.pytube.helpers import uniqueify, cache, DeferredGeneratorList
 
 logger = logging.getLogger(__name__)
@@ -28,22 +30,22 @@ def __init__(self, url: str, proxies: Optional[Dict[str, str]] = None):
         )
 
         self.videos_url = self.channel_url + '/videos'
+        self.shorts_url = self.channel_url + '/shorts'
+        self.live_url = self.channel_url + '/streams'
         self.playlists_url = self.channel_url + '/playlists'
         self.community_url = self.channel_url + '/community'
         self.featured_channels_url = self.channel_url + '/channels'
         self.about_url = self.channel_url + '/about'
-        self.shorts_url = self.channel_url + '/shorts'
-        self.live_url = self.channel_url + '/streams'
+
+        self._html_url = self.videos_url  # Videos will be preferred over short videos and live
+        self._visitor_data = None
 
         # Possible future additions
         self._playlists_html = None
         self._community_html = None
         self._featured_channels_html = None
         self._about_html = None
 
-        self._html_url = self.videos_url  # Videos will be preferred over short videos and live
-        self._visitor_data = None
-
     @property
     def channel_name(self):
         """Get the name of the YouTube channel.
@@ -72,15 +74,34 @@ def vanity_url(self):
         """
         return self.initial_data['metadata']['channelMetadataRenderer'].get('vanityChannelUrl', None)  # noqa:E501
 
+    @property
+    def html_url(self):
+        """Return the url of the channel page currently selected for fetching.
+
+        :rtype: str
+        """
+        return self._html_url
+
+    @html_url.setter
+    def html_url(self, value):
+        """Select another page url and drop everything cached for the old one.
+
+        Assigning the url that is already selected changes nothing.
+        """
+        if self._html_url == value:
+            return
+        # The stored html and the data parsed from it belong to the old page.
+        self._html = None
+        self._initial_data = None
+        # Memoized properties computed from the old page are stale as well.
+        owner = self.__class__
+        for memoized in (owner.video_urls, owner.last_updated, owner.title):
+            memoized.fget.cache_clear()
+        self._html_url = value
+
     @property
     def html(self):
-        """Get the html for the /videos page.
+        """Get the html of the page that ``html_url`` currently points to.
+
+        The downloaded text is kept in ``_html`` and returned again on later
+        accesses for as long as it is non-empty.
 
         :rtype: str
         """
         if self._html:
             return self._html
-        self._html = request.get(self.videos_url)
+        # NOTE(review): requests.get is called without a timeout argument -
+        # confirm that an unbounded wait is acceptable here.
+        self._html = requests.get(self.html_url).text
         return self._html
 
     @property
@@ -139,8 +160,41 @@ def about_html(self):
             self._about_html = request.get(self.about_url)
             return self._about_html
 
-    @staticmethod
-    def _extract_videos(raw_json: str) -> Tuple[List[str], Optional[str]]:
+    def _build_continuation_url(self, continuation: str) -> Tuple[str, dict, dict]:
+        """Assemble the parts of the post request that loads the next page.
+
+        :param str continuation: Continuation token taken from the json of
+            the previously loaded page
+        :rtype: Tuple[str, dict, dict]
+        :returns: The browse endpoint url, the http headers and the json
+            payload for the request, in that order
+        """
+        client_version = "2.20200720.00.02"
+
+        # The endpoint was changed to this format (and to post requests)
+        # between 2022.11.06 and 2022.11.20
+        url = f"https://www.youtube.com/youtubei/v1/browse?key={self.yt_api_key}"
+
+        headers = {
+            "X-YouTube-Client-Name": "1",
+            "X-YouTube-Client-Version": client_version,
+        }
+
+        # Extra data sent as the body of the post request
+        payload = {
+            "continuation": continuation,
+            "context": {
+                "client": {
+                    "clientName": "WEB",
+                    "visitorData": self._visitor_data,
+                    "clientVersion": client_version,
+                },
+            },
+        }
+        return url, headers, payload
+
+    def _extract_videos(self, raw_json: str) -> Tuple[List[str], Optional[str]]:
         """Extracts videos from a raw json page
 
         :param str raw_json: Input json extracted from the page or the last
@@ -153,12 +207,22 @@ def _extract_videos(raw_json: str) -> Tuple[List[str], Optional[str]]:
         # this is the json tree structure, if the json was extracted from
         # html
         try:
-            videos = initial_data["contents"][
-                "twoColumnBrowseResultsRenderer"][
-                "tabs"][1]["tabRenderer"]["content"][
-                "sectionListRenderer"]["contents"][0][
-                "itemSectionRenderer"]["contents"][0][
-                "gridRenderer"]["items"]
+            # Possible tabs: Home, Videos, Shorts, Live, Playlists, Community, Channels, About
+            active_tab = {}
+            for tab in initial_data["contents"]["twoColumnBrowseResultsRenderer"]["tabs"]:
+                tab_url = tab["tabRenderer"]["endpoint"]["commandMetadata"]["webCommandMetadata"]["url"]
+                if tab_url.rsplit('/', maxsplit=1)[-1] == self.html_url.rsplit('/', maxsplit=1)[-1]:
+                    active_tab = tab
+                    break
+
+            # This is the json tree structure for videos, shorts and streams
+            videos = active_tab["tabRenderer"]["content"]["richGridRenderer"]["contents"]
+
+            # This is the json tree structure of visitor data
+            # It is necessary to send the visitorData together with the continuation token
+            self._visitor_data = initial_data["responseContext"]["webResponseContextExtensionData"][
+                "ytConfigData"]["visitorData"]
+
         except (KeyError, IndexError, TypeError):
             try:
                 # this is the json tree structure, if the json was directly sent
@@ -188,22 +252,102 @@ def _extract_videos(raw_json: str) -> Tuple[List[str], Optional[str]]:
             # if there is an error, no continuation is available
             continuation = None
 
+        # only extract the video ids from the video data
+        videos_url = []
+        try:
+            # Extract id from videos and live
+            for x in videos:
+                videos_url.append(f"/watch?v="
+                                  f"{x['richItemRenderer']['content']['videoRenderer']['videoId']}")
+        except (KeyError, IndexError, TypeError):
+            # Extract id from short videos
+            for x in videos:
+                videos_url.append(f"/watch?v="
+                                  f"{x['richItemRenderer']['content']['reelItemRenderer']['videoId']}")
+
         # remove duplicates
-        return (
-            uniqueify(
-                list(
-                    # only extract the video ids from the video data
-                    map(
-                        lambda x: (
-                            f"/watch?v="
-                            f"{x['gridVideoRenderer']['videoId']}"
-                        ),
-                        videos
-                    )
-                ),
-            ),
-            continuation,
-        )
+        return uniqueify(videos_url), continuation
+
+    @property
+    def views(self) -> int:
+        """Extract view count for channel.
+
+        Switches ``html_url`` to the about page as a side effect, then parses
+        the leading number of that page's view-count text.
+
+        :return: Channel view count, or 0 when the count cannot be located
+            or parsed
+        :rtype: int
+        """
+        self.html_url = self.about_url
+        try:
+            tabs = self.initial_data['contents']['twoColumnBrowseResultsRenderer']['tabs']
+            # The "about" tab is taken to be the second-to-last tab.
+            about_tab = tabs[len(tabs) - 2]
+            views_text = about_tab['tabRenderer']['content']['sectionListRenderer']['contents'][0][
+                'itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer'][
+                'viewCountText']['simpleText']
+
+            # "1,234,567 views" -> "1,234,567" -> "1234567"
+            return int(views_text.split(' ')[0].replace(',', ''))
+        except (KeyError, IndexError, TypeError, ValueError):
+            # The lookup mixes dict keys and list indices and ends in int(),
+            # so a changed page layout or an unexpected number format can
+            # fail in any of these ways; all of them mean "count unknown".
+            return 0
+
+    @property
+    @cache
+    def title(self) -> str:
+        """Read the channel title from the metadata of the ``channel_url`` page.
+
+        Switches ``html_url`` to ``channel_url`` before reading.
+
+        :return: Channel title (name)
+        :rtype: str
+        """
+        self.html_url = self.channel_url
+        channel_metadata = self.initial_data['metadata']['channelMetadataRenderer']
+        return channel_metadata['title']
+
+    @property
+    def description(self) -> str:
+        """Read the channel description from the metadata of the ``channel_url`` page.
+
+        Switches ``html_url`` to ``channel_url`` before reading.
+
+        :return: Channel description
+        :rtype: str
+        """
+        self.html_url = self.channel_url
+        channel_metadata = self.initial_data['metadata']['channelMetadataRenderer']
+        return channel_metadata['description']
+
+    @property
+    def length(self):
+        """Read the approximate video count shown in the channel header.
+
+        Switches ``html_url`` to ``channel_url`` before reading.
+
+        :return: Channel videos count
+        :rtype: str
+        """
+        self.html_url = self.channel_url
+        header = self.initial_data['header']['c4TabbedHeaderRenderer']
+        count_runs = header['videosCountText']['runs']
+        return count_runs[0]['text']
+
+    @property
+    @cache
+    def last_updated(self):
+        """Extract the date of the last uploaded video.
+
+        Switches ``html_url`` to the /videos page as a side effect and reads
+        the published-time text of the first item in that page's grid.
+
+        :return: Published-time text of the first listed video, or None when
+            it cannot be located
+        :rtype: Optional[str]
+        """
+        self.html_url = self.videos_url
+        try:
+            # The videos tab is looked up at the fixed position 1.
+            return self.initial_data['contents']['twoColumnBrowseResultsRenderer']['tabs'][1][
+                'tabRenderer']['content']['richGridRenderer']['contents'][0]['richItemRenderer']['content'][
+                'videoRenderer']['publishedTimeText']['simpleText']
+        except (KeyError, IndexError, TypeError):
+            # The path mixes dict keys with list indices, so a missing tab or
+            # an empty grid raises IndexError rather than KeyError.
+            return None
+
+    @property
+    def thumbnail_url(self) -> str:
+        """Read the profile image url from the json of the channel home page.
+
+        :rtype: str
+        :return: a string with the url of the channel's profile image
+        """
+        # The avatar lives in the metadata of the channel home page.
+        self.html_url = self.channel_url
+        avatar = self.initial_data['metadata']['channelMetadataRenderer']['avatar']
+        first_thumbnail = avatar['thumbnails'][0]
+        return first_thumbnail['url']
 
     @property
     def playlists(self) -> Iterable[Playlist]:
@@ -307,6 +451,36 @@ def playlist_urls(self) -> DeferredGeneratorList:
         """
         return DeferredGeneratorList(self.playlist_url_generator())
 
+    @property
+    def videos(self) -> Iterable[YouTube]:
+        """Return the videos of this channel.
+
+        Switches ``html_url`` to the /videos tab, then wraps
+        ``videos_generator()`` in a ``DeferredGeneratorList``.
+        NOTE(review): if the wrapped generator is only consumed later,
+        changing ``html_url`` in between (e.g. via ``shorts`` or ``live``)
+        may change which tab it reads - verify against ``videos_generator``.
+
+        :rtype: DeferredGeneratorList
+        :returns: Deferred list, presumably of YouTube objects
+        """
+        self.html_url = self.videos_url  # Set video tab
+        return DeferredGeneratorList(self.videos_generator())
+
+    @property
+    def shorts(self) -> Iterable[YouTube]:
+        """Return the short videos of this channel.
+
+        Switches ``html_url`` to the /shorts tab, then wraps
+        ``videos_generator()`` in a ``DeferredGeneratorList``.
+
+        :rtype: DeferredGeneratorList
+        :returns: Deferred list, presumably of YouTube objects
+        """
+        self.html_url = self.shorts_url  # Set shorts tab
+        return DeferredGeneratorList(self.videos_generator())
+
+    @property
+    def live(self) -> Iterable[YouTube]:
+        """Return the live streams of this channel.
+
+        Switches ``html_url`` to the /streams tab, then wraps
+        ``videos_generator()`` in a ``DeferredGeneratorList``.
+
+        :rtype: DeferredGeneratorList
+        :returns: Deferred list, presumably of YouTube objects
+        """
+        self.html_url = self.live_url  # Set stream tab
+        return DeferredGeneratorList(self.videos_generator())
+
     @property
     def as_dict(self):
         return {'channelId': self.channel_id,

diff --git a/tutubo/pytube/extract.py b/tutubo/pytube/extract.py
@@ -177,8 +177,10 @@ def channel_name(url: str) -> str:
     if function_match:
         logger.debug("finished regex search, matched: %s", pattern)
         uri_style = function_match.group(1)
-        uri_style = uri_style if uri_style else "c"
         uri_identifier = function_match.group(2)
+        if "@" in url:
+            return f"/@{uri_identifier}"
+        uri_style = uri_style if uri_style else "c"
         return f'/{uri_style}/{uri_identifier}'
 
     raise RegexMatchError(