From 372c27f2e57847b7dc5a230a3e3e3d347317f267 Mon Sep 17 00:00:00 2001
From: Asi Greenholts <88270351+TupleType@users.noreply.github.com>
Date: Fri, 19 Jul 2024 21:04:34 +0300
Subject: [PATCH] community[minor]:  [GoogleApiYoutubeLoader] Replace API used
 in _get_document_for_channel from search to playlistItem (#24034)

- **Description:** Search has a limit of 500 results, playlistItems
doesn't. Added a class in except clause to catch another common error.
- **Issue:** None
- **Dependencies:** None
- **Twitter handle:** @TupleType

---------

Co-authored-by: asi-cider <88270351+asi-cider@users.noreply.github.com>
Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
---
 .../document_loaders/youtube.py               | 29 ++++++++++++-------
 libs/community/scripts/lint_imports.sh        |  2 +-
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/libs/community/langchain_community/document_loaders/youtube.py b/libs/community/langchain_community/document_loaders/youtube.py
index ccab3f7227e22..bfe1cb0b1391e 100644
--- a/libs/community/langchain_community/document_loaders/youtube.py
+++ b/libs/community/langchain_community/document_loaders/youtube.py
@@ -7,6 +7,7 @@
 from pathlib import Path
 from typing import Any, Dict, Generator, List, Optional, Sequence, Union
 from urllib.parse import parse_qs, urlparse
+from xml.etree.ElementTree import ParseError  # OK: trusted-source
 
 from langchain_core.documents import Document
 from langchain_core.pydantic_v1 import root_validator
@@ -28,6 +29,8 @@ class GoogleApiClient:
     As the google api expects credentials you need to set up a google account and
     register your Service. "https://developers.google.com/docs/api/quickstart/python"
 
+    *Security Note*: Note that parsing of the transcripts relies on the standard
+        xml library but the input is viewed as trusted in this case.
 
 
     Example:
@@ -437,6 +440,14 @@ def _get_channel_id(self, channel_name: str) -> str:
         channel_id = response["items"][0]["id"]["channelId"]
         return channel_id
 
+    def _get_uploads_playlist_id(self, channel_id: str) -> str:
+        request = self.youtube_client.channels().list(
+            part="contentDetails",
+            id=channel_id,
+        )
+        response = request.execute()
+        return response["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]
+
     def _get_document_for_channel(self, channel: str, **kwargs: Any) -> List[Document]:
         try:
             from youtube_transcript_api import (
@@ -452,10 +463,11 @@ def _get_document_for_channel(self, channel: str, **kwargs: Any) -> List[Documen
             )
 
         channel_id = self._get_channel_id(channel)
-        request = self.youtube_client.search().list(
+        uploads_playlist_id = self._get_uploads_playlist_id(channel_id)
+        request = self.youtube_client.playlistItems().list(
             part="id,snippet",
-            channelId=channel_id,
-            maxResults=50,  # adjust this value to retrieve more or fewer videos
+            playlistId=uploads_playlist_id,
+            maxResults=50,
         )
         video_ids = []
         while request is not None:
@@ -463,23 +475,20 @@ def _get_document_for_channel(self, channel: str, **kwargs: Any) -> List[Documen
 
             # Add each video ID to the list
             for item in response["items"]:
-                if not item["id"].get("videoId"):
-                    continue
-                meta_data = {"videoId": item["id"]["videoId"]}
+                video_id = item["snippet"]["resourceId"]["videoId"]
+                meta_data = {"videoId": video_id}
                 if self.add_video_info:
                     item["snippet"].pop("thumbnails")
                     meta_data.update(item["snippet"])
                 try:
-                    page_content = self._get_transcripe_for_video_id(
-                        item["id"]["videoId"]
-                    )
+                    page_content = self._get_transcripe_for_video_id(video_id)
                     video_ids.append(
                         Document(
                             page_content=page_content,
                             metadata=meta_data,
                         )
                     )
-                except (TranscriptsDisabled, NoTranscriptFound) as e:
+                except (TranscriptsDisabled, NoTranscriptFound, ParseError) as e:
                     if self.continue_on_failure:
                         logger.error(
                             "Error fetching transscript "
diff --git a/libs/community/scripts/lint_imports.sh b/libs/community/scripts/lint_imports.sh
index 5f2575f03fbee..d25f68139477a 100755
--- a/libs/community/scripts/lint_imports.sh
+++ b/libs/community/scripts/lint_imports.sh
@@ -29,7 +29,7 @@ fi
 # is very nuanced and depends on the user's environment.
 # https://docs.python.org/3/library/xml.etree.elementtree.html
 
-result=$(git -C "$repository_path" grep -En '^from xml.|^import xml$|^import xml.' | grep -vE "# OK: user-must-opt-in" || true)
+result=$(git -C "$repository_path" grep -En '^from xml.|^import xml$|^import xml.' | grep -vE "# OK: user-must-opt-in| # OK: trusted-source" || true)
 
 if [ -n "$result" ]; then
   echo "ERROR: The following lines need to be updated:"