Skip to content

Commit

Permalink
feat: integrate transcripts
Browse files Browse the repository at this point in the history
  • Loading branch information
Davincible authored and corny committed May 4, 2023
1 parent cd26881 commit 3d2aef9
Show file tree
Hide file tree
Showing 8 changed files with 339 additions and 70 deletions.
86 changes: 63 additions & 23 deletions client.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"log"
"math/rand"
"net/http"
"net/url"
"strconv"
"sync"
)
Expand All @@ -24,6 +25,9 @@ var (
ErrNoFormat = errors.New("no video format provided")
)

// DefaultClient is the client info that assureClient installs on a Client
// whose client field is still nil. It is initialised to AndroidClient; there is
// normally no reason to change it, but it is a variable so that you can.
var DefaultClient = AndroidClient

// Client offers methods to download video metadata and video streams.
type Client struct {
// Debug enables debugging output through log package
Expand All @@ -47,6 +51,12 @@ type Client struct {
consentID string
}

// assureClient makes sure c.client is set, falling back to DefaultClient when
// no client has been chosen yet. An already configured client is left untouched.
func (c *Client) assureClient() {
	if c.client != nil {
		return
	}

	c.client = &DefaultClient
}

// GetVideo fetches video metadata
func (c *Client) GetVideo(url string) (*Video, error) {
return c.GetVideoContext(context.Background(), url)
Expand All @@ -63,7 +73,7 @@ func (c *Client) GetVideoContext(ctx context.Context, url string) (*Video, error
}

func (c *Client) videoFromID(ctx context.Context, id string) (*Video, error) {
c.client = &androidClient
c.assureClient()

body, err := c.videoDataByInnertube(ctx, id)
if err != nil {
Expand Down Expand Up @@ -92,7 +102,7 @@ func (c *Client) videoFromID(ctx context.Context, id string) (*Video, error) {

// If the uploader marked the video as inappropriate for some ages, use embed player
if errors.Is(err, ErrLoginRequired) {
c.client = &embeddedClient
c.client = &EmbeddedClient

bodyEmbed, errEmbed := c.videoDataByInnertube(ctx, id)
if errEmbed == nil {
Expand Down Expand Up @@ -121,9 +131,9 @@ type innertubeRequest struct {
BrowseID string `json:"browseId,omitempty"`
Continuation string `json:"continuation,omitempty"`
Context inntertubeContext `json:"context"`
PlaybackContext playbackContext `json:"playbackContext,omitempty"`
ContentCheckOK bool `json:"contentCheckOk"`
racyCheckOk bool `json:"racyCheckOk"`
PlaybackContext *playbackContext `json:"playbackContext,omitempty"`
ContentCheckOK bool `json:"contentCheckOk,omitempty"`
RacyCheckOk bool `json:"racyCheckOk,omitempty"`
Params string `json:"params"`
}

Expand All @@ -133,7 +143,7 @@ type playbackContext struct {

type contentPlaybackContext struct {
// SignatureTimestamp string `json:"signatureTimestamp"`
html5Preference string `json:"html5Preference"`
HTML5Preference string `json:"html5Preference"`
}

type inntertubeContext struct {
Expand Down Expand Up @@ -161,23 +171,25 @@ type clientInfo struct {
}

var (
// might add ANDROID and other in future, but i don't see reason yet
webClient = clientInfo{
// WebClient, better to use Android client but go ahead.
WebClient = clientInfo{
name: "WEB",
version: "2.20210617.01.00",
key: "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8",
userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
}

androidClient = clientInfo{
// AndroidClient, download go brrrrrr.
AndroidClient = clientInfo{
name: "ANDROID",
version: "17.31.35",
key: "AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w",
userAgent: "com.google.android.youtube/17.31.35 (Linux; U; Android 11) gzip",
androidVersion: 30,
}

embeddedClient = clientInfo{
// EmbeddedClient, not really tested.
EmbeddedClient = clientInfo{
name: "WEB_EMBEDDED_PLAYER",
version: "1.19700101",
key: "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8", // seems like same key works for both clients
Expand All @@ -190,19 +202,28 @@ func (c *Client) videoDataByInnertube(ctx context.Context, id string) ([]byte, e
VideoID: id,
Context: prepareInnertubeContext(*c.client),
ContentCheckOK: true,
racyCheckOk: true,
RacyCheckOk: true,
Params: "8AEB",
PlaybackContext: playbackContext{
PlaybackContext: &playbackContext{
ContentPlaybackContext: contentPlaybackContext{
// SignatureTimestamp: sts,
html5Preference: "HTML5_PREF_WANTS",
HTML5Preference: "HTML5_PREF_WANTS",
},
},
}

return c.httpPostBodyBytes(ctx, "https://www.youtube.com/youtubei/v1/player?key="+c.client.key, data)
}

// transcriptDataByInnertube posts a get_transcript request for the video with
// the given id and returns the raw response body.
//
// The request carries only the innertube context of the current client and the
// params value produced by transcriptVideoID. c.client is dereferenced here, so
// it must already be set when this method is called.
func (c *Client) transcriptDataByInnertube(ctx context.Context, id string) ([]byte, error) {
	const endpoint = "https://www.youtube.com/youtubei/v1/get_transcript?key="

	var request innertubeRequest
	request.Context = prepareInnertubeContext(*c.client)
	request.Params = transcriptVideoID(id)

	return c.httpPostBodyBytes(ctx, endpoint+c.client.key, request)
}

func prepareInnertubeContext(clientInfo clientInfo) inntertubeContext {
return inntertubeContext{
Client: innertubeClient{
Expand All @@ -225,7 +246,7 @@ func prepareInnertubePlaylistData(ID string, continuation bool, clientInfo clien
Context: context,
Continuation: ID,
ContentCheckOK: true,
racyCheckOk: true,
RacyCheckOk: true,
Params: "8AEB",
}
}
Expand All @@ -234,11 +255,27 @@ func prepareInnertubePlaylistData(ID string, continuation bool, clientInfo clien
Context: context,
BrowseID: "VL" + ID,
ContentCheckOK: true,
racyCheckOk: true,
RacyCheckOk: true,
Params: "8AEB",
}
}

// transcriptVideoID builds the params value used to fetch the transcript of
// videoID. The language is currently fixed to "en".
//
// The video ID and the encoded language are embedded in a small binary
// envelope which is then passed through base64Enc.
// NOTE(review): the \x0b and \x12 bytes look like length prefixes (11 for the
// video ID, 18 for the encoded language) — confirm before feeding in values of
// a different length.
func transcriptVideoID(videoID string) string {
	// The following can optionally be appended to the format string; its
	// meaning is not known:
	// *3engagement-panel-searchable-transcript-search-panel\x30\x00\x38\x01\x40\x01
	const envelope = "\n\x0b%s\x12\x12%s\x18\x01"

	payload := fmt.Sprintf(envelope, videoID, encTranscriptLang("en"))

	return base64Enc(payload)
}

// encTranscriptLang wraps languageCode in the binary envelope used for
// transcript requests, encodes it with base64PadEnc and query-escapes the
// result.
// NOTE(review): the \x02 byte in front of the language code looks like a length
// prefix for a two-character code such as "en" — confirm before passing longer
// codes.
func encTranscriptLang(languageCode string) string {
	const envelope = "\n\x03asr\x12\x02%s\x1a\x00"

	encoded := base64PadEnc(fmt.Sprintf(envelope, languageCode))

	return url.QueryEscape(encoded)
}

// GetPlaylist fetches playlist metadata
func (c *Client) GetPlaylist(url string) (*Playlist, error) {
return c.GetPlaylistContext(context.Background(), url)
Expand All @@ -248,7 +285,7 @@ func (c *Client) GetPlaylist(url string) (*Playlist, error) {
// for these videos. Playlist entries cannot be downloaded, as they lack all the required metadata, but
// can be used to enumerate all IDs, Authors, Titles, etc.
func (c *Client) GetPlaylistContext(ctx context.Context, url string) (*Playlist, error) {
c.client = &androidClient
c.assureClient()

id, err := extractPlaylistID(url)
if err != nil {
Expand Down Expand Up @@ -319,8 +356,7 @@ func (c *Client) GetStreamContext(ctx context.Context, video *Video, format *For
func (c *Client) downloadOnce(req *http.Request, w *io.PipeWriter, _ *Format) int64 {
resp, err := c.httpDo(req)
if err != nil {
//nolint:errcheck
w.CloseWithError(err)
w.CloseWithError(err) //nolint:errcheck
return 0
}

Expand All @@ -330,8 +366,7 @@ func (c *Client) downloadOnce(req *http.Request, w *io.PipeWriter, _ *Format) in
if err == nil {
w.Close()
} else {
//nolint:errcheck
w.CloseWithError(err)
w.CloseWithError(err) //nolint:errcheck
}
}()

Expand Down Expand Up @@ -447,8 +482,11 @@ func (c *Client) GetStreamURLContext(ctx context.Context, video *Video, format *
}

if format.URL != "" {
return format.URL, nil
// return c.unThrottle(ctx, video.ID, format.URL)
if c.client.androidVersion > 0 {
return format.URL, nil
}

return c.unThrottle(ctx, video.ID, format.URL)
}

// TODO: check rest of this function, is it redundant?
Expand Down Expand Up @@ -482,7 +520,7 @@ func (c *Client) httpDo(req *http.Request) (*http.Response, error) {
req.Header.Set("Sec-Fetch-Mode", "navigate")

if len(c.consentID) == 0 {
c.consentID = strconv.Itoa(rand.Intn(899) + 100)
c.consentID = strconv.Itoa(rand.Intn(899) + 100) //nolint:gosec
}

req.AddCookie(&http.Cookie{
Expand Down Expand Up @@ -517,6 +555,7 @@ func (c *Client) httpGet(ctx context.Context, url string) (*http.Response, error
resp.Body.Close()
return nil, ErrUnexpectedStatusCode(resp.StatusCode)
}

return resp, nil
}

Expand Down Expand Up @@ -557,6 +596,7 @@ func (c *Client) httpPost(ctx context.Context, url string, body interface{}) (*h
resp.Body.Close()
return nil, ErrUnexpectedStatusCode(resp.StatusCode)
}

return resp, nil
}

Expand Down
9 changes: 0 additions & 9 deletions decipher.go
Original file line number Diff line number Diff line change
Expand Up @@ -283,12 +283,3 @@ func (config playerConfig) parseDecipherOps() (operations []DecipherOperation, e
}
return ops, nil
}

// getSignatureTimestamp returns the first capture group of signatureRegexp
// matched against the player config, as a string. It returns
// ErrSignatureTimestampNotFound when the pattern does not match.
func (config playerConfig) getSignatureTimestamp() (string, error) {
	if match := signatureRegexp.FindSubmatch(config); match != nil {
		return string(match[1]), nil
	}

	return "", ErrSignatureTimestampNotFound
}
4 changes: 0 additions & 4 deletions player_parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,7 @@ type playerConfig []byte

var basejsPattern = regexp.MustCompile(`(/s/player/\w+/player_ias.vflset/\w+/base.js)`)

// We could use \d{5} instead of \d+ since it is currently 5 digits, but we can't be sure it will always be 5 digits.
var signatureRegexp = regexp.MustCompile(`(?m)(?:^|,)(?:signatureTimestamp:)(\d+)`)

func (c *Client) getPlayerConfig(ctx context.Context, videoID string) (playerConfig, error) {

embedURL := fmt.Sprintf("https://youtube.com/embed/%s?hl=en", videoID)
embedBody, err := c.httpGetBodyBytes(ctx, embedURL)
if err != nil {
Expand Down
21 changes: 10 additions & 11 deletions playlist.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,14 +94,14 @@ func (p *Playlist) parsePlaylistInfo(ctx context.Context, client *Client, body [

metadata = metadata.Get("playlistHeaderRenderer")

p.Title = getText(metadata, "title")
p.Description = getText(metadata, "description", "descriptionText")
p.Title = sjsonGetText(metadata, "title")
p.Description = sjsonGetText(metadata, "description", "descriptionText")
p.Author = j.GetPath("sidebar", "playlistSidebarRenderer", "items").GetIndex(1).
GetPath("playlistSidebarSecondaryInfoRenderer", "videoOwner", "videoOwnerRenderer", "title", "runs").
GetIndex(0).Get("text").MustString()

if len(p.Author) == 0 {
p.Author = getText(metadata, "owner", "ownerText")
p.Author = sjsonGetText(metadata, "owner", "ownerText")
}

contents, ok := j.CheckGet("contents")
Expand All @@ -110,11 +110,11 @@ func (p *Playlist) parsePlaylistInfo(ctx context.Context, client *Client, body [
}

// contents can have different keys with same child structure
firstPart := getFistKey(contents).GetPath("tabs").GetIndex(0).
firstPart := getFirstKeyJSON(contents).GetPath("tabs").GetIndex(0).
GetPath("tabRenderer", "content", "sectionListRenderer", "contents").GetIndex(0)

// This extra nested item is only set with the web client
if n := firstPart.GetPath("itemSectionRenderer", "contents").GetIndex(0); isValid(n) {
if n := firstPart.GetPath("itemSectionRenderer", "contents").GetIndex(0); isValidJSON(n) {
firstPart = n
}

Expand Down Expand Up @@ -155,12 +155,11 @@ func (p *Playlist) parsePlaylistInfo(ctx context.Context, client *Client, body [
return err
}

var next *sjson.Json
if next = j.GetPath("onResponseReceivedActions").GetIndex(0).
GetPath("appendContinuationItemsAction", "continuationItems"); isValid(next) {
} else if next = j.GetPath("continuationContents", "playlistVideoListContinuation", "contents"); isValid(next) {
} else {
return fmt.Errorf("failed to extract continuation data")
next := j.GetPath("onResponseReceivedActions").GetIndex(0).
GetPath("appendContinuationItemsAction", "continuationItems")

if !isValidJSON(next) {
next = j.GetPath("continuationContents", "playlistVideoListContinuation", "contents")
}

vJSON, err := next.MarshalJSON()
Expand Down
Loading

0 comments on commit 3d2aef9

Please sign in to comment.