-
Notifications
You must be signed in to change notification settings - Fork 34
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(scraper): add Youtube content provider support
- Loading branch information
Showing
8 changed files
with
180 additions
and
16 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
package scraper | ||
|
||
import "context" | ||
|
||
// ContentProvider is a content provider interface | ||
type ContentProvider interface { | ||
Get(ctx context.Context, rawurl string) (*WebPage, error) | ||
Match(url string) bool | ||
} | ||
|
||
// ContentProviders is the registry of all supported content provider | ||
var ContentProviders = map[string]ContentProvider{} | ||
|
||
// GetContentProvider return content provider that match the given URL | ||
func GetContentProvider(rawurl string) ContentProvider { | ||
for _, v := range ContentProviders { | ||
if v.Match(rawurl) { | ||
return v | ||
} | ||
} | ||
return nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
package all | ||
|
||
import ( | ||
// activate youtube content provider support | ||
_ "github.com/ncarlier/readflow/pkg/scraper/content-provider/youtube" | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
package youtube | ||
|
||
import ( | ||
"context" | ||
"encoding/json" | ||
"fmt" | ||
"io/ioutil" | ||
"net/http" | ||
"net/url" | ||
"path" | ||
"regexp" | ||
|
||
"github.com/ncarlier/readflow/pkg/constant" | ||
"github.com/ncarlier/readflow/pkg/scraper" | ||
) | ||
|
||
// URL patterns recognised as YouTube videos.
//
// Each pattern is anchored at the start so that a YouTube-looking substring
// buried in the path or query of another site does not match, and the dots
// of the host name are escaped so they only match a literal ".". The host
// may carry any number of sub-domain labels (www., music., ...) or none.
var (
	// ytWRe matches "watch" page URLs (video ID carried by the query string).
	ytWRe = regexp.MustCompile(`^https://(?:[A-Za-z0-9-]+\.)*youtube\.com/watch.+`)
	// ytLRe matches "/v/<id>" URLs.
	ytLRe = regexp.MustCompile(`^https://(?:[A-Za-z0-9-]+\.)*youtube\.com/v/.+`)
	// ytSRe matches short "youtu.be/<id>" URLs.
	ytSRe = regexp.MustCompile(`^https://youtu\.be/.+`)
)

// ytOEmbedBaseURL is the oEmbed endpoint queried for video metadata; the
// target video URL is appended as the value of the trailing "url" parameter.
const ytOEmbedBaseURL = "https://www.youtube.com/oembed?maxheight=600&maxwidth=800&format=json&url="
|
||
// oEmbedResponse holds the fields decoded from the JSON document returned
// by the oEmbed endpoint. Keys absent from the document leave the
// corresponding field empty.
type oEmbedResponse struct {
	// Title and AuthorName are used for the page title and description.
	Title      string `json:"title,omitempty"`
	AuthorName string `json:"author_name,omitempty"`

	// HTML is the markup provided by the endpoint for the video.
	HTML string `json:"html,omitempty"`

	// ThumbnailURL is used as the page illustration.
	ThumbnailURL string `json:"thumbnail_url,omitempty"`
}
|
||
// youtubeContentProvider is the content provider for YouTube video URLs.
type youtubeContentProvider struct {
	// httpClient performs the requests sent to the oEmbed endpoint.
	httpClient *http.Client
}
|
||
func newYoutubeContentProvider() *youtubeContentProvider { | ||
return &youtubeContentProvider{ | ||
httpClient: &http.Client{ | ||
Timeout: constant.DefaultTimeout, | ||
}, | ||
} | ||
} | ||
|
||
func (ycp youtubeContentProvider) Get(ctx context.Context, rawurl string) (*scraper.WebPage, error) { | ||
oembedURL := ytOEmbedBaseURL + rawurl | ||
|
||
req, err := http.NewRequest("GET", oembedURL, nil) | ||
if err != nil { | ||
return nil, err | ||
} | ||
req.Header.Set("User-Agent", constant.UserAgent) | ||
req = req.WithContext(ctx) | ||
res, err := ycp.httpClient.Do(req) | ||
if err != nil || res.StatusCode >= 300 { | ||
if err == nil { | ||
err = fmt.Errorf("bad status code: %d", res.StatusCode) | ||
} | ||
return nil, err | ||
} | ||
if res.Body != nil { | ||
defer res.Body.Close() | ||
} | ||
|
||
body, err := ioutil.ReadAll(res.Body) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
oembed := oEmbedResponse{} | ||
err = json.Unmarshal(body, &oembed) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
return &scraper.WebPage{ | ||
Title: oembed.Title, | ||
HTML: oembed.HTML, | ||
Image: oembed.ThumbnailURL, | ||
URL: rawurl, | ||
Text: "Youtube video from " + oembed.AuthorName, | ||
SiteName: "Youtube", | ||
}, nil | ||
} | ||
|
||
func (ycp youtubeContentProvider) Match(url string) bool { | ||
return ytWRe.MatchString(url) || ytLRe.MatchString(url) || ytSRe.MatchString(url) | ||
} | ||
|
||
func getYoutubeVideoID(rawurl string) string { | ||
u, err := url.Parse(rawurl) | ||
if err != nil { | ||
return "" | ||
} | ||
if ytWRe.MatchString(rawurl) { | ||
q := u.Query() | ||
return q.Get("v") | ||
} | ||
return path.Base(u.Path) | ||
} | ||
|
||
func init() { | ||
scraper.ContentProviders["youtube"] = newYoutubeContentProvider() | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
package youtube | ||
|
||
import ( | ||
"context" | ||
"testing" | ||
|
||
"github.com/stretchr/testify/assert" | ||
|
||
"github.com/ncarlier/readflow/pkg/scraper" | ||
) | ||
|
||
func TestYoutubeContentProvider(t *testing.T) { | ||
ctx := context.TODO() | ||
|
||
rawurl := "https://www.youtube.com/watch?v=ee-LhNZPZ1U" | ||
|
||
provider := scraper.GetContentProvider(rawurl) | ||
assert.NotNil(t, provider, "content provider not found") | ||
assert.True(t, provider.Match(rawurl)) | ||
|
||
page, err := provider.Get(ctx, rawurl) | ||
assert.NoError(t, err) | ||
assert.NotNil(t, page) | ||
assert.Equal(t, "Youtube", page.SiteName) | ||
assert.Equal(t, "Deus Ex Silicium : Les Circuits Intégrés", page.Title) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters