diff --git a/cmd/redpanda-connect/main.go b/cmd/redpanda-connect/main.go index 7408fb4b4b..5b8013602f 100644 --- a/cmd/redpanda-connect/main.go +++ b/cmd/redpanda-connect/main.go @@ -12,8 +12,9 @@ import ( ) var ( - Version string - DateBuilt string + Version string + DateBuilt string + BinaryName string = "redpanda-connect" ) func redpandaTopLevelConfigField() *service.ConfigField { @@ -26,6 +27,7 @@ func main() { service.RunCLI( context.Background(), service.CLIOptSetVersion(Version, DateBuilt), + service.CLIOptSetBinaryName(BinaryName), service.CLIOptSetProductName("Redpanda Connect"), service.CLIOptSetDocumentationURL("https://docs.redpanda.com/redpanda-connect"), service.CLIOptSetMainSchemaFrom(func() *service.ConfigSchema { diff --git a/internal/impl/html/bloblang.go b/internal/impl/html/bloblang.go new file mode 100644 index 0000000000..7d10a7d4b2 --- /dev/null +++ b/internal/impl/html/bloblang.go @@ -0,0 +1,55 @@ +package html + +import ( + "fmt" + + "github.com/microcosm-cc/bluemonday" + "github.com/redpanda-data/benthos/v4/public/bloblang" +) + +func init() { + stripHTMLSpec := bloblang.NewPluginSpec(). + Category("String Manipulation"). + Description(`Attempts to remove all HTML tags from a target string.`). + Example("", `root.stripped = this.value.strip_html()`, + [2]string{ + `{"value":"

the plain old text

"}`, + `{"stripped":"the plain old text"}`, + }). + Example("It's also possible to provide an explicit list of element types to preserve in the output.", + `root.stripped = this.value.strip_html(["article"])`, + [2]string{ + `{"value":"

the plain old text

"}`, + `{"stripped":"
the plain old text
"}`, + }). + Param(bloblang.NewAnyParam("preserve").Description("An optional array of element types to preserve in the output.").Optional()) + + if err := bloblang.RegisterMethodV2( + "strip_html", stripHTMLSpec, + func(args *bloblang.ParsedParams) (bloblang.Method, error) { + p := bluemonday.NewPolicy() + + var tags []any + if rawArgs := args.AsSlice(); len(rawArgs) > 0 { + tags, _ = rawArgs[0].([]any) + } + + if len(tags) > 0 { + tagStrs := make([]string, len(tags)) + for i, ele := range tags { + var ok bool + if tagStrs[i], ok = ele.(string); !ok { + return nil, fmt.Errorf("invalid arg at index %v: expected string, got %T", i, ele) + } + } + p = p.AllowElements(tagStrs...) + } + + return bloblang.StringMethod(func(s string) (any, error) { + return p.Sanitize(s), nil + }), nil + }, + ); err != nil { + panic(err) + } +} diff --git a/internal/impl/html/bloblang_test.go b/internal/impl/html/bloblang_test.go new file mode 100644 index 0000000000..2e929d0de5 --- /dev/null +++ b/internal/impl/html/bloblang_test.go @@ -0,0 +1,36 @@ +package html + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/redpanda-data/benthos/v4/public/bloblang" +) + +func TestStripHTMLNoArgs(t *testing.T) { + e, err := bloblang.Parse(`root = this.strip_html()`) + require.NoError(t, err) + + res, err := e.Query(`
meow
`) + require.NoError(t, err) + + assert.Equal(t, "meow", res) +} + +func TestStripHTMLWithArgs(t *testing.T) { + e, err := bloblang.Parse(`root = this.strip_html(["strong","h1"])`) + require.NoError(t, err) + + res, err := e.Query(`
+

meow

+

hello world this is some text. +

`) + require.NoError(t, err) + + assert.Equal(t, ` +

meow

+ hello world this is some text. +`, res) +} diff --git a/public/components/pure/extended/package.go b/public/components/pure/extended/package.go index 4163daeb5a..0418318746 100644 --- a/public/components/pure/extended/package.go +++ b/public/components/pure/extended/package.go @@ -13,6 +13,7 @@ import ( _ "github.com/redpanda-data/benthos/v4/public/components/pure/extended" _ "github.com/redpanda-data/connect/v4/internal/impl/awk" + _ "github.com/redpanda-data/connect/v4/internal/impl/html" _ "github.com/redpanda-data/connect/v4/internal/impl/jsonpath" _ "github.com/redpanda-data/connect/v4/internal/impl/lang" _ "github.com/redpanda-data/connect/v4/internal/impl/msgpack"