Skip to content

Commit

Permalink
Add strip_html
Browse files Browse the repository at this point in the history
  • Loading branch information
Jeffail committed May 29, 2024
1 parent 9a212ee commit e6ab281
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 2 deletions.
6 changes: 4 additions & 2 deletions cmd/redpanda-connect/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@ import (
)

var (
Version string
DateBuilt string
Version string
DateBuilt string
BinaryName string = "redpanda-connect"
)

func redpandaTopLevelConfigField() *service.ConfigField {
Expand All @@ -26,6 +27,7 @@ func main() {
service.RunCLI(
context.Background(),
service.CLIOptSetVersion(Version, DateBuilt),
service.CLIOptSetBinaryName(BinaryName),
service.CLIOptSetProductName("Redpanda Connect"),
service.CLIOptSetDocumentationURL("https://docs.redpanda.com/redpanda-connect"),
service.CLIOptSetMainSchemaFrom(func() *service.ConfigSchema {
Expand Down
55 changes: 55 additions & 0 deletions internal/impl/html/bloblang.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package html

import (
"fmt"

"github.com/microcosm-cc/bluemonday"
"github.com/redpanda-data/benthos/v4/public/bloblang"
)

// init registers the strip_html bloblang method. Registration failure is
// treated as a programmer error and panics at package load.
//
// The method takes one optional parameter, "preserve": an array of element
// names (strings) that should be kept in the output. Any other non-null value
// for "preserve", or a non-string element within the array, is rejected with
// an error when the method is constructed.
func init() {
	stripHTMLSpec := bloblang.NewPluginSpec().
		Category("String Manipulation").
		Description(`Attempts to remove all HTML tags from a target string.`).
		Example("", `root.stripped = this.value.strip_html()`,
			[2]string{
				`{"value":"<p>the plain <strong>old text</strong></p>"}`,
				`{"stripped":"the plain old text"}`,
			}).
		Example("It's also possible to provide an explicit list of element types to preserve in the output.",
			`root.stripped = this.value.strip_html(["article"])`,
			[2]string{
				`{"value":"<article><p>the plain <strong>old text</strong></p></article>"}`,
				`{"stripped":"<article>the plain old text</article>"}`,
			}).
		Param(bloblang.NewAnyParam("preserve").Description("An optional array of element types to preserve in the output.").Optional())

	if err := bloblang.RegisterMethodV2(
		"strip_html", stripHTMLSpec,
		func(args *bloblang.ParsedParams) (bloblang.Method, error) {
			// The policy is built once here and captured by the returned
			// method, so it is not rebuilt for every sanitized string.
			p := bluemonday.NewPolicy()

			var tags []any
			if rawArgs := args.AsSlice(); len(rawArgs) > 0 {
				switch preserve := rawArgs[0].(type) {
				case nil:
					// Presumably the value seen when the optional parameter
					// is omitted (or null is passed): preserve nothing.
				case []any:
					tags = preserve
				default:
					// Previously a non-array value was silently dropped,
					// which stripped every tag without telling the user
					// that their argument had been ignored.
					return nil, fmt.Errorf("invalid preserve argument: expected array, got %T", preserve)
				}
			}

			if len(tags) > 0 {
				tagStrs := make([]string, len(tags))
				for i, ele := range tags {
					var ok bool
					if tagStrs[i], ok = ele.(string); !ok {
						return nil, fmt.Errorf("invalid arg at index %v: expected string, got %T", i, ele)
					}
				}
				p = p.AllowElements(tagStrs...)
			}

			return bloblang.StringMethod(func(s string) (any, error) {
				return p.Sanitize(s), nil
			}), nil
		},
	); err != nil {
		panic(err)
	}
}
36 changes: 36 additions & 0 deletions internal/impl/html/bloblang_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
package html

import (
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"github.com/redpanda-data/benthos/v4/public/bloblang"
)

// TestStripHTMLNoArgs verifies that strip_html called without arguments
// returns only the text content of the input.
func TestStripHTMLNoArgs(t *testing.T) {
	mapping, parseErr := bloblang.Parse(`root = this.strip_html()`)
	require.NoError(t, parseErr)

	got, queryErr := mapping.Query(`<div>meow</div>`)
	require.NoError(t, queryErr)

	assert.Equal(t, "meow", got)
}

// TestStripHTMLWithArgs verifies that elements named in the preserve list
// (here "strong" and "h1") survive sanitization while other tags are removed.
func TestStripHTMLWithArgs(t *testing.T) {
	const input = `<div>
<h1>meow</h1>
<p>hello world this is <strong>some</strong> text.
</div>`

	const expected = `
<h1>meow</h1>
hello world this is <strong>some</strong> text.
`

	mapping, parseErr := bloblang.Parse(`root = this.strip_html(["strong","h1"])`)
	require.NoError(t, parseErr)

	got, queryErr := mapping.Query(input)
	require.NoError(t, queryErr)

	assert.Equal(t, expected, got)
}
1 change: 1 addition & 0 deletions public/components/pure/extended/package.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
_ "github.com/redpanda-data/benthos/v4/public/components/pure/extended"

_ "github.com/redpanda-data/connect/v4/internal/impl/awk"
_ "github.com/redpanda-data/connect/v4/internal/impl/html"
_ "github.com/redpanda-data/connect/v4/internal/impl/jsonpath"
_ "github.com/redpanda-data/connect/v4/internal/impl/lang"
_ "github.com/redpanda-data/connect/v4/internal/impl/msgpack"
Expand Down

0 comments on commit e6ab281

Please sign in to comment.