feat(scraper): add Youtube content provider support
ncarlier committed Jan 28, 2021
1 parent 4638418 commit 0dd623e
Showing 8 changed files with 180 additions and 16 deletions.
22 changes: 22 additions & 0 deletions pkg/scraper/content-provider.go
@@ -0,0 +1,22 @@
package scraper

import "context"

// ContentProvider is the interface implemented by specialized content providers
type ContentProvider interface {
Get(ctx context.Context, rawurl string) (*WebPage, error)
Match(url string) bool
}

// ContentProviders is the registry of all supported content providers
var ContentProviders = map[string]ContentProvider{}

// GetContentProvider returns the content provider that matches the given URL, or nil if none matches
func GetContentProvider(rawurl string) ContentProvider {
for _, v := range ContentProviders {
if v.Match(rawurl) {
return v
}
}
return nil
}
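
A provider plugs into this registry by registering itself from an init function, as the YouTube provider below does. For illustration, a minimal sketch of what another provider could look like — the vimeo package name and matching rule are hypothetical, not part of this commit:

package vimeo

import (
	"context"
	"errors"
	"strings"

	"github.com/ncarlier/readflow/pkg/scraper"
)

type vimeoContentProvider struct{}

// Match reports whether this provider handles the given URL (hypothetical rule).
func (p vimeoContentProvider) Match(url string) bool {
	return strings.HasPrefix(url, "https://vimeo.com/")
}

// Get would map the provider's metadata API to a WebPage; left unimplemented in this sketch.
func (p vimeoContentProvider) Get(ctx context.Context, rawurl string) (*scraper.WebPage, error) {
	return nil, errors.New("not implemented")
}

func init() {
	// Side-effect registration: importing this package makes the provider discoverable.
	scraper.ContentProviders["vimeo"] = vimeoContentProvider{}
}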
6 changes: 6 additions & 0 deletions pkg/scraper/content-provider/all/all.go
@@ -0,0 +1,6 @@
package all

import (
// activate youtube content provider support
_ "github.com/ncarlier/readflow/pkg/scraper/content-provider/youtube"
)
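
The all package exists only for its import side effects: blank-importing it (as pkg/service/articles_create.go does below) pulls in every provider package and runs their init registrations.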
101 changes: 101 additions & 0 deletions pkg/scraper/content-provider/youtube/youtube.go
@@ -0,0 +1,101 @@
package youtube

import (
"context"
"encoding/json"
"fmt"
"io/ioutil"
"net/http"
"net/url"
"path"
"regexp"

"github.com/ncarlier/readflow/pkg/constant"
"github.com/ncarlier/readflow/pkg/scraper"
)

// URL forms recognized as YouTube videos: watch pages, legacy embed paths and short links.
var ytWRe = regexp.MustCompile(`https://.+\.youtube\.com/watch.+`)
var ytLRe = regexp.MustCompile(`https://.+\.youtube\.com/v/.+`)
var ytSRe = regexp.MustCompile(`https://youtu\.be/.+`)

const ytOEmbedBaseURL = "https://www.youtube.com/oembed?maxheight=600&maxwidth=800&format=json&url="

// oEmbedResponse is the subset of YouTube's oEmbed response used by the provider
type oEmbedResponse struct {
Title string `json:"title,omitempty"`
HTML string `json:"html,omitempty"`
ThumbnailURL string `json:"thumbnail_url,omitempty"`
AuthorName string `json:"author_name,omitempty"`
}

type youtubeContentProvider struct {
httpClient *http.Client
}

func newYoutubeContentProvider() *youtubeContentProvider {
return &youtubeContentProvider{
httpClient: &http.Client{
Timeout: constant.DefaultTimeout,
},
}
}

// Get retrieves video metadata from YouTube's oEmbed endpoint and maps it to a WebPage
func (ycp youtubeContentProvider) Get(ctx context.Context, rawurl string) (*scraper.WebPage, error) {
// Escape the target URL so it survives as a query-string value.
oembedURL := ytOEmbedBaseURL + url.QueryEscape(rawurl)

req, err := http.NewRequest("GET", oembedURL, nil)
if err != nil {
return nil, err
}
req.Header.Set("User-Agent", constant.UserAgent)
req = req.WithContext(ctx)
res, err := ycp.httpClient.Do(req)
if err != nil {
return nil, err
}
// Always release the response body, even on non-2xx responses.
defer res.Body.Close()
if res.StatusCode >= 300 {
return nil, fmt.Errorf("bad status code: %d", res.StatusCode)
}

body, err := ioutil.ReadAll(res.Body)
if err != nil {
return nil, err
}

oembed := oEmbedResponse{}
err = json.Unmarshal(body, &oembed)
if err != nil {
return nil, err
}

return &scraper.WebPage{
Title: oembed.Title,
HTML: oembed.HTML,
Image: oembed.ThumbnailURL,
URL: rawurl,
Text: "Youtube video from " + oembed.AuthorName,
SiteName: "Youtube",
}, nil
}

// Match returns true if the URL matches one of the supported YouTube URL forms
func (ycp youtubeContentProvider) Match(url string) bool {
return ytWRe.MatchString(url) || ytLRe.MatchString(url) || ytSRe.MatchString(url)
}

// getYoutubeVideoID extracts the video ID from the v query parameter of a watch URL, or from the last path segment otherwise (added here but not yet called)
func getYoutubeVideoID(rawurl string) string {
u, err := url.Parse(rawurl)
if err != nil {
return ""
}
if ytWRe.MatchString(rawurl) {
q := u.Query()
return q.Get("v")
}
return path.Base(u.Path)
}

func init() {
scraper.ContentProviders["youtube"] = newYoutubeContentProvider()
}
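
The integration test below drives the whole provider against the live oEmbed endpoint. A narrower unit test for the unexported getYoutubeVideoID helper (which this commit adds but does not yet call) might look like this sketch:

package youtube

import "testing"

func TestGetYoutubeVideoID(t *testing.T) {
	// Both URL forms should resolve to the same video ID.
	cases := map[string]string{
		"https://www.youtube.com/watch?v=ee-LhNZPZ1U": "ee-LhNZPZ1U",
		"https://youtu.be/ee-LhNZPZ1U":                "ee-LhNZPZ1U",
	}
	for rawurl, want := range cases {
		if got := getYoutubeVideoID(rawurl); got != want {
			t.Errorf("getYoutubeVideoID(%q) = %q, want %q", rawurl, got, want)
		}
	}
}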
26 changes: 26 additions & 0 deletions pkg/scraper/content-provider/youtube/youtube_test.go
@@ -0,0 +1,26 @@
package youtube

import (
"context"
"testing"

"github.com/stretchr/testify/assert"

"github.com/ncarlier/readflow/pkg/scraper"
)

func TestYoutubeContentProvider(t *testing.T) {
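// This test hits YouTube's live oEmbed endpoint and therefore requires network access.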
ctx := context.TODO()

rawurl := "https://www.youtube.com/watch?v=ee-LhNZPZ1U"

provider := scraper.GetContentProvider(rawurl)
assert.NotNil(t, provider, "content provider not found")
assert.True(t, provider.Match(rawurl))

page, err := provider.Get(ctx, rawurl)
assert.NoError(t, err)
assert.NotNil(t, page)
assert.Equal(t, "Youtube", page.SiteName)
assert.Equal(t, "Deus Ex Silicium : Les Circuits Intégrés", page.Title)
}
12 changes: 6 additions & 6 deletions pkg/scraper/external.go
@@ -32,27 +32,27 @@ func NewExternalWebScraper(uri string) (WebScraper, error) {
}, nil
}

func (ws extrenalWebScraper) Scrap(ctx context.Context, url string) (*WebPage, error) {
webPage, err := ws.scrap(ctx, url)
func (ws extrenalWebScraper) Scrap(ctx context.Context, rawurl string) (*WebPage, error) {
webPage, err := ws.scrap(ctx, rawurl)
if err != nil {
ws.logger.Error().Err(err).Msg("unable to scrap web page with external service, fallback on internal service")
return NewInternalWebScraper().Scrap(ctx, url)
return NewInternalWebScraper().Scrap(ctx, rawurl)
}
return webPage, nil
}

func (ws extrenalWebScraper) scrap(ctx context.Context, url string) (*WebPage, error) {
func (ws extrenalWebScraper) scrap(ctx context.Context, rawurl string) (*WebPage, error) {
ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
defer cancel()
req, err := http.NewRequestWithContext(ctx, "GET", ws.uri, nil)
if err != nil {
return nil, err
}
q := req.URL.Query()
q.Add("u", url)
q.Add("u", rawurl)
req.URL.RawQuery = q.Encode()

ws.logger.Debug().Str("url", url).Msg("scraping webpage")
ws.logger.Debug().Str("url", rawurl).Msg("scraping webpage")
res, err := http.DefaultClient.Do(req)
if err != nil {
return nil, err
24 changes: 15 additions & 9 deletions pkg/scraper/internal.go
@@ -6,7 +6,7 @@ import (
"fmt"
"io"
"net/http"
nurl "net/url"
"net/url"
"strings"
"time"

@@ -30,17 +30,23 @@ func NewInternalWebScraper() WebScraper {
}
}

func (ws internalWebScraper) Scrap(ctx context.Context, url string) (*WebPage, error) {
func (ws internalWebScraper) Scrap(ctx context.Context, rawurl string) (*WebPage, error) {
ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
defer cancel()
// Validate URL
_, err := nurl.ParseRequestURI(url)
_, err := url.ParseRequestURI(rawurl)
if err != nil {
return nil, fmt.Errorf("invalid URL: %v", err)
}

// Get content provider for this URL
contentProvider := GetContentProvider(rawurl)
if contentProvider != nil {
return contentProvider.Get(ctx, rawurl)
}

// Get URL content type
contentType, err := ws.getContentType(ctx, url)
contentType, err := ws.getContentType(ctx, rawurl)
if err != nil {
return nil, err
}
@@ -50,7 +56,7 @@ func (ws internalWebScraper) Scrap(ctx context.Context, url string) (*WebPage, error) {
}

// Get URL content
res, err := ws.get(ctx, url)
res, err := ws.get(ctx, rawurl)
if err != nil {
return nil, err
}
@@ -112,8 +118,8 @@ func (ws internalWebScraper) Scrap(ctx context.Context, url string) (*WebPage, error) {
return result, nil
}

func (ws internalWebScraper) getContentType(ctx context.Context, url string) (string, error) {
req, err := http.NewRequest("HEAD", url, nil)
func (ws internalWebScraper) getContentType(ctx context.Context, rawurl string) (string, error) {
req, err := http.NewRequest("HEAD", rawurl, nil)
if err != nil {
return "", err
}
@@ -126,8 +132,8 @@ func (ws internalWebScraper) getContentType(ctx context.Context, url string) (string, error) {
return res.Header.Get("Content-type"), nil
}

func (ws internalWebScraper) get(ctx context.Context, url string) (*http.Response, error) {
req, err := http.NewRequest("GET", url, nil)
func (ws internalWebScraper) get(ctx context.Context, rawurl string) (*http.Response, error) {
req, err := http.NewRequest("GET", rawurl, nil)
if err != nil {
return nil, err
}
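
With the provider hook in place, any consumer of the internal scraper gets this dispatch for free. A usage sketch (assuming the all package is imported and network access is available):

package main

import (
	"context"
	"fmt"

	"github.com/ncarlier/readflow/pkg/scraper"
	// activate all content providers
	_ "github.com/ncarlier/readflow/pkg/scraper/content-provider/all"
)

func main() {
	ws := scraper.NewInternalWebScraper()
	// A YouTube URL is served by the provider instead of the generic HTML pipeline.
	page, err := ws.Scrap(context.Background(), "https://www.youtube.com/watch?v=ee-LhNZPZ1U")
	if err != nil {
		panic(err)
	}
	fmt.Println(page.SiteName, "-", page.Title)
}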
2 changes: 1 addition & 1 deletion pkg/scraper/scraper.go
@@ -19,7 +19,7 @@ type WebPage struct {

// WebScraper is the interface implemented by web scraping providers
type WebScraper interface {
Scrap(ctx context.Context, url string) (*WebPage, error)
Scrap(ctx context.Context, rawurl string) (*WebPage, error)
}

// NewWebScraper creates a new web scraping service
3 changes: 3 additions & 0 deletions pkg/service/articles_create.go
@@ -5,6 +5,9 @@ import (

"github.com/ncarlier/readflow/pkg/event"
"github.com/ncarlier/readflow/pkg/model"

// activate all content providers
_ "github.com/ncarlier/readflow/pkg/scraper/content-provider/all"
)

// ArticleCreationOptions article creation options