From 372c27f2e57847b7dc5a230a3e3e3d347317f267 Mon Sep 17 00:00:00 2001 From: Asi Greenholts <88270351+TupleType@users.noreply.github.com> Date: Fri, 19 Jul 2024 21:04:34 +0300 Subject: [PATCH] community[minor]: [GoogleApiYoutubeLoader] Replace API used in _get_document_for_channel from search to playlistItem (#24034) - **Description:** The search endpoint has a limit of 500 results; playlistItems does not. Also added `ParseError` to the except clause to catch another common error when fetching transcripts. - **Issue:** None - **Dependencies:** None - **Twitter handle:** @TupleType --------- Co-authored-by: asi-cider <88270351+asi-cider@users.noreply.github.com> Co-authored-by: Eugene Yurtsev --- .../document_loaders/youtube.py | 29 ++++++++++++------- libs/community/scripts/lint_imports.sh | 2 +- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/youtube.py b/libs/community/langchain_community/document_loaders/youtube.py index ccab3f7227e22..bfe1cb0b1391e 100644 --- a/libs/community/langchain_community/document_loaders/youtube.py +++ b/libs/community/langchain_community/document_loaders/youtube.py @@ -7,6 +7,7 @@ from pathlib import Path from typing import Any, Dict, Generator, List, Optional, Sequence, Union from urllib.parse import parse_qs, urlparse +from xml.etree.ElementTree import ParseError # OK: trusted-source from langchain_core.documents import Document from langchain_core.pydantic_v1 import root_validator @@ -28,6 +29,8 @@ class GoogleApiClient: As the google api expects credentials you need to set up a google account and register your Service. "https://developers.google.com/docs/api/quickstart/python" + *Security Note*: Note that parsing of the transcripts relies on the standard + xml library but the input is viewed as trusted in this case. 
Example: @@ -437,6 +440,14 @@ def _get_channel_id(self, channel_name: str) -> str: channel_id = response["items"][0]["id"]["channelId"] return channel_id + def _get_uploads_playlist_id(self, channel_id: str) -> str: + request = self.youtube_client.channels().list( + part="contentDetails", + id=channel_id, + ) + response = request.execute() + return response["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"] + def _get_document_for_channel(self, channel: str, **kwargs: Any) -> List[Document]: try: from youtube_transcript_api import ( @@ -452,10 +463,11 @@ def _get_document_for_channel(self, channel: str, **kwargs: Any) -> List[Documen ) channel_id = self._get_channel_id(channel) - request = self.youtube_client.search().list( + uploads_playlist_id = self._get_uploads_playlist_id(channel_id) + request = self.youtube_client.playlistItems().list( part="id,snippet", - channelId=channel_id, - maxResults=50, # adjust this value to retrieve more or fewer videos + playlistId=uploads_playlist_id, + maxResults=50, ) video_ids = [] while request is not None: @@ -463,23 +475,20 @@ def _get_document_for_channel(self, channel: str, **kwargs: Any) -> List[Documen # Add each video ID to the list for item in response["items"]: - if not item["id"].get("videoId"): - continue - meta_data = {"videoId": item["id"]["videoId"]} + video_id = item["snippet"]["resourceId"]["videoId"] + meta_data = {"videoId": video_id} if self.add_video_info: item["snippet"].pop("thumbnails") meta_data.update(item["snippet"]) try: - page_content = self._get_transcripe_for_video_id( - item["id"]["videoId"] - ) + page_content = self._get_transcripe_for_video_id(video_id) video_ids.append( Document( page_content=page_content, metadata=meta_data, ) ) - except (TranscriptsDisabled, NoTranscriptFound) as e: + except (TranscriptsDisabled, NoTranscriptFound, ParseError) as e: if self.continue_on_failure: logger.error( "Error fetching transscript " diff --git a/libs/community/scripts/lint_imports.sh 
b/libs/community/scripts/lint_imports.sh index 5f2575f03fbee..d25f68139477a 100755 --- a/libs/community/scripts/lint_imports.sh +++ b/libs/community/scripts/lint_imports.sh @@ -29,7 +29,7 @@ fi # is very nuanced and depends on the user's environment. # https://docs.python.org/3/library/xml.etree.elementtree.html -result=$(git -C "$repository_path" grep -En '^from xml.|^import xml$|^import xml.' | grep -vE "# OK: user-must-opt-in" || true) +result=$(git -C "$repository_path" grep -En '^from xml.|^import xml$|^import xml.' | grep -vE "# OK: user-must-opt-in| # OK: trusted-source" || true) if [ -n "$result" ]; then echo "ERROR: The following lines need to be updated:"