Skip to content

Commit

Permalink
add parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
bigspawn committed Dec 10, 2024
1 parent 0dffd76 commit e2d4652
Show file tree
Hide file tree
Showing 6 changed files with 1,282 additions and 53 deletions.
3 changes: 0 additions & 3 deletions internal/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -331,9 +331,6 @@ func runAlterPortal(
if err != nil {
return fmt.Errorf("failed to create job: %w", err)
}
if err != nil {
return fmt.Errorf("failed to create job: %w", err)
}
return nil
}

Expand Down
51 changes: 26 additions & 25 deletions internal/file_host.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,36 +5,37 @@ import "strings"
// Host names of the source sites referenced across this package, listed
// alphabetically.
const (
	alterportalHost  = "alterportal.net"
	coreradioHost    = "coreradio.online"
	getrockmusicHost = "getrockmusic.net"
)

var fileHosts = []string{
"mediafire.com",
"zippyshare.com",
"mega.nz",
"solidfiles.com",
"drive.google.com",
"files.mail.ru",
"disk.yandex.ru",
"yadi.sk",
"files.fm",
"uppit.com",
"filecrypt.cc",
"turb.cc",
"turbobit.net",
"coreradio.ru",
alterportalHost,
getrockmusicHost,
"turb.pw",
"krakenfiles.com",
"trbbt.net",
"drive.google.com",
"megaup.net",
"1fichier.com",
"cloud.mail.ru",
// fileHosts is the set of host names that isAllowedFileHost accepts.
// Only the keys matter: isAllowedFileHost ranges over them and reports true
// when the given host contains any key as a substring, so the empty-struct
// values carry no data.
var fileHosts = map[string]struct{}{
"1fichier.com": {},
"cloud.mail.ru": {},
"coreradio.ru": {},
"disk.yandex.ru": {},
"drive.google.com": {},
"filecrypt.cc": {},
"files.fm": {},
"files.mail.ru": {},
"krakenfiles.com": {},
"mediafire.com": {},
"mega.nz": {},
"megaup.net": {},
"solidfiles.com": {},
"trbbt.net": {},
"turb.cc": {},
"turb.pw": {},
"turbobit.net": {},
"uppit.com": {},
"yadi.sk": {},
"zippyshare.com": {},
// The parsed sites themselves are registered through their shared constants.
alterportalHost: {},
coreradioHost: {},
getrockmusicHost: {},
}

func isAllowedFileHost(host string) bool {
for _, s := range fileHosts {
for s, _ := range fileHosts {
if strings.Contains(host, s) {
return true
}
Expand Down
102 changes: 78 additions & 24 deletions internal/parser_coreradio.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,17 @@ import (
"context"
"encoding/base64"
"fmt"
"io"
"net/http"
"net/url"
"strings"

"github.com/PuerkitoBio/goquery"
"github.com/go-pkgz/lgr"
"github.com/mmcdole/gofeed"
)

const CoreRadioParserRssURL = "https://coreradio.ru/rss.xml"
const CoreRadioParserRssURL = "https://coreradio.online/rss.xml"

type CoreRadioParser struct {
Client *http.Client
Expand All @@ -31,48 +33,87 @@ func (p *CoreRadioParser) Parse(ctx context.Context, item *gofeed.Item) (*News,
return nil, fmt.Errorf("coreradio: response is not 200 OK: status=%s, link=%s", resp.Status, item.Link)
}

news := NewNewsFromItem(item)
news, err := NewNewsFromItem(item)
if err != nil {
return nil, fmt.Errorf("coreradio: failed to create news from item: %w", err)
}

return ParseHtml(ctx, p.Lgr, news, resp.Body)
}

doc, err := goquery.NewDocumentFromReader(resp.Body)
func ParseHtml(ctx context.Context, l lgr.L, news *News, r io.Reader) (*News, error) {
doc, err := goquery.NewDocumentFromReader(r)
if err != nil {
return nil, fmt.Errorf("coreradio: NewDocumentFromReader: %w", err)
}

// image link
var ok bool
news.ImageLink, ok = doc.
imageLink, ok := doc.
Find("#dle-content > div.full-news > div.full-news-top > div.full-news-left > center > a > img").
Attr("src")
if !ok {
itemDoc, err := goquery.NewDocumentFromReader(bytes.NewBufferString(item.Description))
if ok {
il, err := url.Parse(imageLink)
if err != nil {
return nil, fmt.Errorf("coreradio: NewDocumentFromReader: Description: %w", err)
return nil, fmt.Errorf("coreradio: parse image link: %w", err)
}

news.ImageLink, ok = itemDoc.Find("img[src]").Attr("src")
if !ok {
return nil, fmt.Errorf("coreradio: Find: image: not found: link=%s", item.Link)
if il.Query().Get("url") != "" {
imageLink, err = DecodeBase64(il.Query().Get("url"))
if err != nil {
return nil, fmt.Errorf("coreradio: decode image base64: %w", err)
}

imageLink = ExtractAfterDecode(imageLink)
}

news.ImageLink = WebpToPng(imageLink)
}
news.ImageLink = WebpToPng(news.ImageLink)

// download link
doc.
Find("#dle-content > div.full-news > div.full-news-top > div.full-news-right > center > div").
Find(".quotel").
Find("a[href]").
Each(DownloadLinkSelector(news))

if len(news.DownloadLink) == 0 {
return nil, ErrSkipItem
return nil, fmt.Errorf("coreradio: download link not found: %w", ErrSkipItem)
}

var links []string
for i := range news.DownloadLink {
if !strings.Contains(news.DownloadLink[i], engineSuffix) {
l.Logf("[INFO] skip wrong link for parser: %s", news.DownloadLink[i])
continue
}

l.Logf("[DEBUG] link: %s\n", news.DownloadLink[i])

link, err := DecodeBase64(ExtractLink(news.DownloadLink[i]))
if err != nil {
return nil, fmt.Errorf("coreradio: DecodeBase64: link=%s", item.Link)
return nil, fmt.Errorf("coreradio: DecodeBase64: link=%s: %w", news.DownloadLink[i], err)
}

l.Logf("[DEBUG] decoded link: %s\n", link)

purl, err := url.ParseQuery(link)
if err != nil {
return nil, fmt.Errorf("coreradio: ParseQuery: link=%s: %w", link, err)
}
news.DownloadLink[i] = ExtractAfterDecode(link)

l.Logf("[DEBUG] parsed link: %s\n", purl)

if purl.Get("url") == "" {
l.Logf("[INFO] skip wrong link for parser: %s", news.DownloadLink[i])
continue
}

ll := ExtractAfterDecode(purl.Get("url"))

l.Logf("[DEBUG] extracted link: %s\n", ll)

links = append(links, ll)
}
news.DownloadLink = links

// text
content := doc.Find("#dle-content > div.full-news > div.full-news-top > div.full-news-right > div.full-news-info")
Expand Down Expand Up @@ -106,19 +147,30 @@ func (p *CoreRadioParser) Parse(ctx context.Context, item *gofeed.Item) (*News,
}
news.Text = strings.TrimSpace(b.String())

if isSkippedGenre(p.Lgr, news.Text) {
return nil, ErrSkipItem
if isSkippedGenre(l, news.Text) {
return nil, fmt.Errorf("coreradio: genre must be skipped: %w", ErrSkipItem)
}

return news, nil
}

func NewNewsFromItem(item *gofeed.Item) *News {
return &News{
Title: strings.TrimSpace(item.Title),
PageLink: item.Link,
DateTime: *item.PublishedParsed,
// NewNewsFromItem builds a News from an RSS feed item.
//
// The whitespace-trimmed title, the page link and the publication time are
// copied from the item. The image link is read from the src attribute of an
// <img> element in the item's HTML description and passed through WebpToPng.
//
// An error is returned when the item carries no parsed publication date, when
// the description cannot be loaded as an HTML document, or when the
// description contains no image.
func NewNewsFromItem(item *gofeed.Item) (*News, error) {
	// PublishedParsed is a pointer that is dereferenced for DateTime below;
	// guard it so an item without a parsed date yields an error, not a panic.
	if item.PublishedParsed == nil {
		return nil, fmt.Errorf("coreradio: published date: not found: link=%s", item.Link)
	}

	itemDoc, err := goquery.NewDocumentFromReader(bytes.NewBufferString(item.Description))
	if err != nil {
		return nil, fmt.Errorf("coreradio: NewDocumentFromReader: Description: %w", err)
	}

	imageLink, ok := itemDoc.Find("img[src]").Attr("src")
	if !ok {
		return nil, fmt.Errorf("coreradio: Find: image: not found: link=%s", item.Link)
	}

	return &News{
		Title:     strings.TrimSpace(item.Title),
		PageLink:  item.Link,
		DateTime:  *item.PublishedParsed,
		ImageLink: WebpToPng(imageLink),
	}, nil
}

func GetPage(ctx context.Context, client *http.Client, link string) (*http.Response, error) {
Expand Down Expand Up @@ -147,9 +199,11 @@ func DecodeBase64(s string) (string, error) {
return string(b), nil
}

// engineSuffix is the path-and-query prefix of the site's engine/go.php links.
// ParseHtml skips download links that do not contain it, and ExtractLink
// joins it with coreradioHost to build its engineURL prefix.
const engineSuffix = "/engine/go.php?url="

func ExtractLink(s string) string {
const (
engineURL = "https://coreradio.ru/engine/go.php?url="
engineURL = "https://" + coreradioHost + engineSuffix
equalSymbol = "%3D"
slash = "%2F"
slashLen = len(slash)
Expand Down
12 changes: 12 additions & 0 deletions internal/parser_coreradio_test.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
package internal

import (
"bytes"
"context"
_ "embed"
"net/http"
"os"
"testing"
Expand Down Expand Up @@ -127,3 +129,13 @@ func TestExtractAfterDecode(t *testing.T) {
})
}
}

// data holds the contents of testdata/core_radio.html, embedded into the test
// binary at build time; TestParseHtml feeds it to ParseHtml.
//
//go:embed testdata/core_radio.html
var data []byte

func TestParseHtml(t *testing.T) {
n, err := ParseHtml(context.Background(), lgr.NoOp, &News{}, bytes.NewReader(data))
require.NoError(t, err)

t.Logf("%v\n", n)
}
5 changes: 4 additions & 1 deletion internal/parser_getrockmusic.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@ type GetRockMusicParser struct {
}

func (p *GetRockMusicParser) Parse(ctx context.Context, item *gofeed.Item) (*News, error) {
news := NewNewsFromItem(item)
news, err := NewNewsFromItem(item)
if err != nil {
return nil, fmt.Errorf("failed to create news from item: %w", err)
}

req, err := http.NewRequestWithContext(ctx, http.MethodGet, item.Link, nil)
if err != nil {
Expand Down
1,162 changes: 1,162 additions & 0 deletions internal/testdata/core_radio.html

Large diffs are not rendered by default.

0 comments on commit e2d4652

Please sign in to comment.