feat(scraper): add external Web Scraping service
ncarlier committed May 1, 2020
1 parent 49aac4e commit 7034bba
Showing 13 changed files with 215 additions and 96 deletions.
2 changes: 1 addition & 1 deletion main.go
@@ -77,7 +77,7 @@ func main() {
 	}
 
 	// Configure the service registry
-	err = service.Configure(database, userPlans)
+	err = service.Configure(conf, database, userPlans)
 	if err != nil {
 		database.Close()
 		log.Fatal().Err(err).Msg("could not init service registry")
1 change: 1 addition & 0 deletions pkg/config/config.go
@@ -13,4 +13,5 @@ type Config struct {
 	SentryDSN   string `flag:"sentry-dsn" desc:"Sentry DSN URL"`
 	ImageProxy  string `flag:"image-proxy" desc:"Image proxy service (passthrough if empty)"`
 	UserPlans   string `flag:"user-plans" desc:"User plans definition file (deactivated if empty)"`
+	WebScraping string `flag:"web-scraping" desc:"Web Scraping service (internal if empty)"`
 }
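
The new field is the only knob: left empty, readflow keeps scraping in-process; set to a URI, it delegates to the external service introduced below in pkg/scraper. How service.Configure threads conf through the registry is not part of this diff, so the following is only a sketch of the intended selection behaviour, with hypothetical flag values:

package main

import (
	"fmt"

	"github.com/ncarlier/readflow/pkg/scraper"
)

func main() {
	// Hypothetical flag values: "" selects the internal implementation,
	// a URI selects the external Web Scraping service.
	for _, uri := range []string{"", "http://scraper.example.org"} {
		ws, err := scraper.NewWebScraper(uri)
		if err != nil {
			fmt.Println("invalid web-scraping URI:", err)
			continue
		}
		fmt.Printf("%q -> %T\n", uri, ws)
	}
}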
24 changes: 12 additions & 12 deletions pkg/readability/meta.go → pkg/html/meta.go
@@ -1,4 +1,4 @@
-package readability
+package html
 
 import (
 	"bytes"
@@ -15,39 +15,39 @@ type Meta struct {
 	Content  string
 }
 
-// Metas is the set of meta tags
-type Metas map[string]*Meta
+// MetaSet is the set of meta tags
+type MetaSet map[string]*Meta
 
 // GetContent gets the first content from the given keys
-func (m Metas) GetContent(keys ...string) *string {
+func (m MetaSet) GetContent(keys ...string) string {
 	for _, key := range keys {
 		if m[key] != nil {
-			return &m[key].Content
+			return m[key].Content
 		}
 	}
-	return nil
+	return ""
 }
 
-// ExtractMetas extracts meta tags from an HTML document.
-func ExtractMetas(doc io.Reader) (Metas, error) {
+// ExtractMeta extracts meta tags from an HTML document.
+func ExtractMeta(doc io.Reader) (MetaSet, error) {
 	var buf bytes.Buffer
 	tee := io.TeeReader(doc, &buf)
 
-	metas := make(map[string]*Meta)
+	metaSet := make(map[string]*Meta)
 	z := html.NewTokenizer(tee)
 	for {
 		tt := z.Next()
 		if tt == html.ErrorToken {
 			if z.Err() == io.EOF {
-				return metas, nil
+				return metaSet, nil
 			}
 			return nil, z.Err()
 		}
 
 		t := z.Token()
 
 		if t.DataAtom == atom.Head && t.Type == html.EndTagToken {
-			return metas, nil
+			return metaSet, nil
 		}
 
 		if t.DataAtom == atom.Meta {
@@ -69,7 +69,7 @@ func ExtractMetas(doc io.Reader) (Metas, error) {
 			if meta.Property != "" {
 				key = meta.Property
 			}
-			metas[key] = &meta
+			metaSet[key] = &meta
 		}
 	}
 }
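
Besides the rename, the API drops pointer returns: GetContent now yields the content of the first matching key, or an empty string when none is present, so callers no longer dereference. A minimal usage sketch with a made-up HTML fragment (the renamed test below exercises the same behaviour against a fuller document):

package main

import (
	"fmt"
	"strings"

	"github.com/ncarlier/readflow/pkg/html"
)

func main() {
	// Made-up fragment for illustration.
	page := `<head><meta property="og:title" content="test case"/></head>`
	meta, err := html.ExtractMeta(strings.NewReader(page))
	if err != nil {
		panic(err)
	}
	// The first key with a value wins; a miss is now "" instead of nil.
	fmt.Println(meta.GetContent("og:title", "twitter:title")) // test case
}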
16 changes: 8 additions & 8 deletions pkg/readability/test/meta_test.go → pkg/html/test/meta_test.go
@@ -6,7 +6,7 @@ import (
 
 	"github.com/stretchr/testify/assert"
 
-	"github.com/ncarlier/readflow/pkg/readability"
+	"github.com/ncarlier/readflow/pkg/html"
 )
 
 var testCase = `<head>
@@ -20,12 +20,12 @@ var testCase = `<head>
 </head>`
 
 func TestExtract(t *testing.T) {
-	metas, err := readability.ExtractMetas(strings.NewReader(testCase))
+	meta, err := html.ExtractMeta(strings.NewReader(testCase))
 	assert.Nil(t, err)
-	assert.Equal(t, 6, len(metas))
-	assert.Equal(t, "", metas["og:title"].Name)
-	assert.Equal(t, "og:title", metas["og:title"].Property)
-	assert.Equal(t, "test case", metas["og:title"].Content)
-	assert.Equal(t, "twitter description", *metas.GetContent("twitter:description", "description"))
-	assert.Equal(t, "iso-8859-1", metas["charset"].Content)
+	assert.Equal(t, 6, len(meta))
+	assert.Equal(t, "", meta["og:title"].Name)
+	assert.Equal(t, "og:title", meta["og:title"].Property)
+	assert.Equal(t, "test case", meta["og:title"].Content)
+	assert.Equal(t, "twitter description", meta.GetContent("twitter:description", "description"))
+	assert.Equal(t, "iso-8859-1", meta["charset"].Content)
 }
19 changes: 0 additions & 19 deletions pkg/readability/conv.go

This file was deleted.

74 changes: 74 additions & 0 deletions pkg/scraper/external.go
@@ -0,0 +1,74 @@
package scraper

import (
	"context"
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
	"time"

	"github.com/rs/zerolog"
	"github.com/rs/zerolog/log"
)

type externalWebScraper struct {
	uri    string
	logger zerolog.Logger
}

// NewExternalWebScraper creates an external web scraping service
func NewExternalWebScraper(uri string) (WebScraper, error) {
	if _, err := url.ParseRequestURI(uri); err != nil {
		return nil, fmt.Errorf("invalid Web Scraping service URI: %s", uri)
	}
	logger := log.With().Str("component", "webscraper").Str("uri", uri).Logger()
	logger.Debug().Msg("using external service")

	return &externalWebScraper{
		uri:    uri,
		logger: logger,
	}, nil
}

func (ws externalWebScraper) Scrap(ctx context.Context, url string) (*WebPage, error) {
	webPage, err := ws.scrap(ctx, url)
	if err != nil {
		ws.logger.Error().Err(err).Msg("unable to scrape web page with external service, falling back to internal service")
		return NewInternalWebScraper().Scrap(ctx, url)
	}
	return webPage, nil
}

func (ws externalWebScraper) scrap(ctx context.Context, url string) (*WebPage, error) {
	ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
	defer cancel()
	req, err := http.NewRequestWithContext(ctx, "GET", ws.uri, nil)
	if err != nil {
		return nil, err
	}
	q := req.URL.Query()
	q.Add("u", url)
	req.URL.RawQuery = q.Encode()

	ws.logger.Debug().Str("url", url).Msg("scraping webpage")
	res, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()

	if res.StatusCode >= 400 {
		return nil, fmt.Errorf("invalid web scraping response: %d", res.StatusCode)
	}

	if ct := res.Header.Get("Content-Type"); ct != "" {
		if ct != "application/json" {
			return nil, fmt.Errorf("invalid web scraping Content-Type response: %s", ct)
		}
	}

	webPage := WebPage{}
	err = json.NewDecoder(res.Body).Decode(&webPage)
	return &webPage, err
}
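
The commit pins down the external contract only through this client: a GET request carrying the page address in the u query parameter, answered within five seconds with an application/json body shaped like WebPage. A minimal sketch of a compatible endpoint, with an invented stub handler:

package main

import (
	"encoding/json"
	"log"
	"net/http"
)

// webPage mirrors the JSON shape of scraper.WebPage (defined below in
// pkg/scraper/scraper.go); only a few of its fields are shown here.
type webPage struct {
	Title string `json:"title,omitempty"`
	HTML  string `json:"html,omitempty"`
	Text  string `json:"text,omitempty"`
}

func main() {
	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		target := r.URL.Query().Get("u") // page address sent by readflow
		// A real service would fetch and extract `target` here.
		page := webPage{Title: "stub", Text: "extracted from " + target}
		w.Header().Set("Content-Type", "application/json")
		if err := json.NewEncoder(w).Encode(page); err != nil {
			log.Println(err)
		}
	})
	log.Fatal(http.ListenAndServe(":8080", nil))
}

Because Scrap falls back to the internal scraper on any error, a misbehaving endpoint degrades extraction quality rather than breaking article fetching altogether.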
85 changes: 46 additions & 39 deletions pkg/readability/readability.go → pkg/scraper/internal.go
@@ -1,4 +1,4 @@
-package readability
+package scraper
 
 import (
 	"bytes"
@@ -11,31 +11,21 @@ import (
 	"time"
 
 	read "github.com/go-shiori/go-readability"
-	"github.com/ncarlier/readflow/pkg/model"
+	"github.com/ncarlier/readflow/pkg/html"
 	"github.com/ncarlier/readflow/pkg/tooling"
 	"golang.org/x/net/html/charset"
 )
 
-func getContentType(ctx context.Context, url string) (string, error) {
-	ctx, _ = context.WithTimeout(ctx, 10*time.Second)
-	req, _ := http.NewRequest("HEAD", url, nil)
-	req = req.WithContext(ctx)
-	res, err := http.DefaultClient.Do(req)
-	if err != nil {
-		return "", err
-	}
-	return res.Header.Get("Content-type"), nil
-}
+type internalWebScraper struct{}
 
-func get(ctx context.Context, url string) (*http.Response, error) {
-	ctx, _ = context.WithTimeout(ctx, 10*time.Second)
-	req, _ := http.NewRequest("GET", url, nil)
-	req = req.WithContext(ctx)
-	return http.DefaultClient.Do(req)
+// NewInternalWebScraper creates an internal web scraping service
+func NewInternalWebScraper() WebScraper {
+	return &internalWebScraper{}
 }
 
-// FetchArticle fetch article from an URL
-func FetchArticle(ctx context.Context, url string) (*model.Article, error) {
+func (ws internalWebScraper) Scrap(ctx context.Context, url string) (*WebPage, error) {
+	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
+	defer cancel()
 	// Validate URL
 	_, err := nurl.ParseRequestURI(url)
 	if err != nil {
@@ -63,20 +53,17 @@ func FetchArticle(ctx context.Context, url string) (*model.Article, error) {
 		return nil, err
 	}
 
-	// Extract metas
-	metas, err := ExtractMetas(body)
+	// Extract meta
+	meta, err := html.ExtractMeta(body)
 	if err != nil {
 		return nil, err
 	}
 
-	// Create article with Open Graph attributes
-	result := &model.Article{
-		Text:  metas.GetContent("og:description", "twitter:description", "description"),
-		Image: metas.GetContent("og:image", "twitter:image"),
-	}
-	title := metas.GetContent("og:title")
-	if title != nil {
-		result.Title = *title
+	// Create article with Open Graph attributes
+	result := &WebPage{
+		Title: meta.GetContent("og:title"),
+		Text:  meta.GetContent("og:description", "twitter:description", "description"),
+		Image: meta.GetContent("og:image", "twitter:image"),
 	}
 
 	var buffer bytes.Buffer
@@ -94,23 +81,43 @@ func FetchArticle(ctx context.Context, url string) (*model.Article, error) {
 	}
 
 	// Complete result with extracted properties
-	result.HTML = &article.Content
+	result.HTML = article.Content
 	if result.Title == "" {
 		result.Title = article.Title
 	}
-	if result.Text == nil {
+	if result.Text == "" {
 		// FIXME: readability excerpt doesn't handle UTF-8 well
-		text := tooling.ToUTF8(article.Excerpt)
-		result.Text = &text
+		result.Text = tooling.ToUTF8(article.Excerpt)
 	}
-	if result.Image == nil {
-		result.Image = &article.Image
+	if result.Image == "" {
+		result.Image = article.Image
 	}
 
-	// TODO: add other properties to the result
-	// article.Favicon
-	// article.Length
-	// article.SiteName
+	result.Favicon = article.Favicon
+	result.Length = article.Length
+	result.SiteName = article.SiteName
 
 	return result, nil
 }
+
+func getContentType(ctx context.Context, url string) (string, error) {
+	req, err := http.NewRequest("HEAD", url, nil)
+	if err != nil {
+		return "", err
+	}
+	req = req.WithContext(ctx)
+	res, err := http.DefaultClient.Do(req)
+	if err != nil {
+		return "", err
+	}
+	return res.Header.Get("Content-type"), nil
+}
+
+func get(ctx context.Context, url string) (*http.Response, error) {
+	req, err := http.NewRequest("GET", url, nil)
+	if err != nil {
+		return nil, err
+	}
+	req = req.WithContext(ctx)
+	return http.DefaultClient.Do(req)
+}
30 changes: 30 additions & 0 deletions pkg/scraper/scraper.go
@@ -0,0 +1,30 @@
package scraper

import (
	"context"
)

// WebPage is the result of scraping a web page
type WebPage struct {
	Title    string `json:"title,omitempty"`
	HTML     string `json:"html,omitempty"`
	Text     string `json:"text,omitempty"`
	Length   int    `json:"length,omitempty"`
	Excerpt  string `json:"excerpt,omitempty"`
	SiteName string `json:"sitename,omitempty"`
	Image    string `json:"image,omitempty"`
	Favicon  string `json:"favicon,omitempty"`
}

// WebScraper is the interface of a Web Scraping provider
type WebScraper interface {
	Scrap(ctx context.Context, url string) (*WebPage, error)
}

// NewWebScraper creates a new Web Scraping service
func NewWebScraper(uri string) (WebScraper, error) {
	if uri == "" {
		return NewInternalWebScraper(), nil
	}
	return NewExternalWebScraper(uri)
}
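
Callers stay indifferent to the backing implementation. A minimal end-to-end sketch, assuming a hypothetical external service on localhost (an empty URI would select the internal scraper instead):

package main

import (
	"context"
	"fmt"

	"github.com/ncarlier/readflow/pkg/scraper"
)

func main() {
	// Hypothetical external service URI; the external scraper falls back
	// to the internal one when the remote call fails.
	ws, err := scraper.NewWebScraper("http://localhost:8080")
	if err != nil {
		panic(err)
	}
	page, err := ws.Scrap(context.Background(), "https://about.readflow.app/")
	if err != nil {
		panic(err)
	}
	fmt.Println(page.Title)
}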
@@ -6,16 +6,16 @@ import (
 
 	"github.com/stretchr/testify/assert"
 
-	"github.com/ncarlier/readflow/pkg/readability"
+	"github.com/ncarlier/readflow/pkg/scraper"
 )
 
-func TestFetchNonReadablePage(t *testing.T) {
+func TestInternalScrape(t *testing.T) {
 	ctx := context.TODO()
-	article, err := readability.FetchArticle(ctx, "https://about.readflow.app/")
+	page, err := scraper.NewInternalWebScraper().Scrap(ctx, "https://about.readflow.app/")
 	assert.NotNil(t, err)
 	assert.Equal(t, "unable to extract content from HTML page", err.Error())
-	assert.NotNil(t, article)
-	assert.Equal(t, "readflow", article.Title)
-	assert.Equal(t, "Read your Internet article flow in one place with complete peace of mind and freedom", *article.Text)
-	assert.Equal(t, "https://about.readflow.app/images/readflow.png", *article.Image)
+	assert.NotNil(t, page)
+	assert.Equal(t, "readflow", page.Title)
+	assert.Equal(t, "Read your Internet article flow in one place with complete peace of mind and freedom", page.Text)
+	assert.Equal(t, "https://about.readflow.app/images/readflow.png", page.Image)
 }