diff --git a/cmd/redpanda-connect/main.go b/cmd/redpanda-connect/main.go
index 7408fb4b4b..5b8013602f 100644
--- a/cmd/redpanda-connect/main.go
+++ b/cmd/redpanda-connect/main.go
@@ -12,8 +12,9 @@ import (
)
var (
- Version string
- DateBuilt string
+ Version string // Handed to service.CLIOptSetVersion in main; presumably injected at build time (e.g. -ldflags -X) — confirm.
+ DateBuilt string // Handed to service.CLIOptSetVersion in main together with Version; presumably a build timestamp — confirm.
+ BinaryName string = "redpanda-connect" // Handed to service.CLIOptSetBinaryName in main; defaults to the name of this command.
)
func redpandaTopLevelConfigField() *service.ConfigField {
@@ -26,6 +27,7 @@ func main() {
service.RunCLI(
context.Background(),
service.CLIOptSetVersion(Version, DateBuilt),
+ service.CLIOptSetBinaryName(BinaryName),
service.CLIOptSetProductName("Redpanda Connect"),
service.CLIOptSetDocumentationURL("https://docs.redpanda.com/redpanda-connect"),
service.CLIOptSetMainSchemaFrom(func() *service.ConfigSchema {
diff --git a/internal/impl/html/bloblang.go b/internal/impl/html/bloblang.go
new file mode 100644
index 0000000000..7d10a7d4b2
--- /dev/null
+++ b/internal/impl/html/bloblang.go
@@ -0,0 +1,55 @@
+package html
+
+import (
+ "fmt"
+
+ "github.com/microcosm-cc/bluemonday"
+ "github.com/redpanda-data/benthos/v4/public/bloblang"
+)
+
+func init() { // Registers the strip_html Bloblang method at package load; panics if registration fails.
+	stripHTMLSpec := bloblang.NewPluginSpec().
+		Category("String Manipulation").
+		Description(`Attempts to remove all HTML tags from a target string.`).
+		Example("", `root.stripped = this.value.strip_html()`,
+			[2]string{
+				`{"value":"<p>the plain <b>old text</b></p>"}`,
+				`{"stripped":"the plain old text"}`,
+			}).
+		Example("It's also possible to provide an explicit list of element types to preserve in the output.",
+			`root.stripped = this.value.strip_html(["article"])`,
+			[2]string{
+				`{"value":"<article><p>the plain <b>old text</b></p></article>"}`,
+				`{"stripped":"<article>the plain old text</article>"}`,
+			}).
+		Param(bloblang.NewAnyParam("preserve").Description("An optional array of element types to preserve in the output.").Optional())
+
+	if err := bloblang.RegisterMethodV2(
+		"strip_html", stripHTMLSpec,
+		func(args *bloblang.ParsedParams) (bloblang.Method, error) { // Runs once per parsed call site; returns the per-value method.
+			p := bluemonday.NewPolicy() // Fresh policy; elements are only allowed via AllowElements below.
+
+			var tags []any
+			if rawArgs := args.AsSlice(); len(rawArgs) > 0 {
+				tags, _ = rawArgs[0].([]any) // A failed assertion (e.g. argument omitted) leaves tags nil, so nothing is preserved.
+			}
+
+			if len(tags) > 0 {
+				tagStrs := make([]string, len(tags))
+				for i, ele := range tags {
+					var ok bool
+					if tagStrs[i], ok = ele.(string); !ok { // Every entry of the preserve list must be a string.
+						return nil, fmt.Errorf("invalid arg at index %v: expected string, got %T", i, ele)
+					}
+				}
+				p = p.AllowElements(tagStrs...)
+			}
+
+			return bloblang.StringMethod(func(s string) (any, error) {
+				return p.Sanitize(s), nil // The policy built above is reused for every value.
+			}), nil
+		},
+	); err != nil {
+		panic(err)
+	}
+}
diff --git a/internal/impl/html/bloblang_test.go b/internal/impl/html/bloblang_test.go
new file mode 100644
index 0000000000..2e929d0de5
--- /dev/null
+++ b/internal/impl/html/bloblang_test.go
@@ -0,0 +1,36 @@
+package html
+
+import (
+ "testing"
+
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+
+ "github.com/redpanda-data/benthos/v4/public/bloblang"
+)
+
+func TestStripHTMLNoArgs(t *testing.T) { // Without arguments every tag is removed and only the text remains.
+	e, err := bloblang.Parse(`root = this.strip_html()`)
+	require.NoError(t, err)
+
+	res, err := e.Query(`<div>meow</div>`)
+	require.NoError(t, err)
+
+	assert.Equal(t, "meow", res)
+}
+
+func TestStripHTMLWithArgs(t *testing.T) { // Listed elements (strong, h1) are kept; div and p are stripped, surrounding whitespace is untouched.
+	e, err := bloblang.Parse(`root = this.strip_html(["strong","h1"])`)
+	require.NoError(t, err)
+
+	res, err := e.Query(`<div>
+  <h1>meow</h1>
+  <p>hello world this is <strong>some</strong> text.</p>
+</div>`)
+	require.NoError(t, err)
+
+	assert.Equal(t, `
+  <h1>meow</h1>
+  hello world this is <strong>some</strong> text.
+`, res)
+}
diff --git a/public/components/pure/extended/package.go b/public/components/pure/extended/package.go
index 4163daeb5a..0418318746 100644
--- a/public/components/pure/extended/package.go
+++ b/public/components/pure/extended/package.go
@@ -13,6 +13,7 @@ import (
_ "github.com/redpanda-data/benthos/v4/public/components/pure/extended"
_ "github.com/redpanda-data/connect/v4/internal/impl/awk"
+ _ "github.com/redpanda-data/connect/v4/internal/impl/html"
_ "github.com/redpanda-data/connect/v4/internal/impl/jsonpath"
_ "github.com/redpanda-data/connect/v4/internal/impl/lang"
_ "github.com/redpanda-data/connect/v4/internal/impl/msgpack"