From bf0b42b863aafffd7258b8c8754a1e1ca8e17b42 Mon Sep 17 00:00:00 2001 From: Teodor Yanev Date: Thu, 8 Feb 2024 22:52:10 +0200 Subject: [PATCH 1/8] add: homoglyphs detection --- docs/docs/ref/proto.md | 49 + internal/engine/eval/eval.go | 3 + .../application/homoglyphs_service.go | 48 + .../application/invisible_characters_eval.go | 109 + .../application/mixed_scripts_eval.go | 114 + .../eval/homoglyphs/communication/reviewer.go | 163 + .../homoglyphs/domain/invisible_characters.go | 45 + .../domain/invisible_characters_test.go | 60 + .../eval/homoglyphs/domain/mixed_scripts.go | 152 + .../homoglyphs/domain/mixed_scripts_test.go | 160 + .../invisible_characters_dictionary.go | 82 + .../homoglyphs/domain/resources/scripts.txt | 3033 ++++++++++++ .../eval/homoglyphs/util/gh_text_review.go | 70 + internal/engine/ingester/diff/diff.go | 127 +- pkg/api/openapi/minder/v1/minder.swagger.json | 66 +- pkg/api/protobuf/go/minder/v1/minder.pb.go | 4284 +++++++++-------- pkg/api/protobuf/go/minder/v1/minder.pb.gw.go | 138 +- pkg/api/protobuf/go/minder/v1/rule_types.go | 3 + pkg/api/protobuf/go/minder/v1/validators.go | 15 +- proto/minder/v1/minder.proto | 23 + 20 files changed, 6657 insertions(+), 2087 deletions(-) create mode 100644 internal/engine/eval/homoglyphs/application/homoglyphs_service.go create mode 100644 internal/engine/eval/homoglyphs/application/invisible_characters_eval.go create mode 100644 internal/engine/eval/homoglyphs/application/mixed_scripts_eval.go create mode 100644 internal/engine/eval/homoglyphs/communication/reviewer.go create mode 100644 internal/engine/eval/homoglyphs/domain/invisible_characters.go create mode 100644 internal/engine/eval/homoglyphs/domain/invisible_characters_test.go create mode 100644 internal/engine/eval/homoglyphs/domain/mixed_scripts.go create mode 100644 internal/engine/eval/homoglyphs/domain/mixed_scripts_test.go create mode 100644 internal/engine/eval/homoglyphs/domain/resources/invisible_characters_dictionary.go create 
mode 100644 internal/engine/eval/homoglyphs/domain/resources/scripts.txt create mode 100644 internal/engine/eval/homoglyphs/util/gh_text_review.go diff --git a/docs/docs/ref/proto.md b/docs/docs/ref/proto.md index ec5d82edb1..5cbe963d40 100644 --- a/docs/docs/ref/proto.md +++ b/docs/docs/ref/proto.md @@ -911,6 +911,43 @@ ListRuleTypesResponse is the response to list rule types. | rule_types | [RuleType](#minder-v1-RuleType) | repeated | rule_types is the list of rule types. | + + +#### PrContents + + + +| Field | Type | Label | Description | +| ----- | ---- | ----- | ----------- | +| pr | [PullRequest](#minder-v1-PullRequest) | | | +| files | [PrContents.File](#minder-v1-PrContents-File) | repeated | | + + + + +#### PrContents.File + + + +| Field | Type | Label | Description | +| ----- | ---- | ----- | ----------- | +| name | [string](#string) | | | +| file_patch_url | [string](#string) | | | +| patch_lines | [PrContents.File.Line](#minder-v1-PrContents-File-Line) | repeated | | + + + + +#### PrContents.File.Line + + + +| Field | Type | Label | Description | +| ----- | ---- | ----- | ----------- | +| line_number | [int32](#int32) | | | +| content | [string](#string) | | | + + #### PrDependencies @@ -1313,6 +1350,18 @@ endpoint and how we compare it to the rule. | rego | [RuleType.Definition.Eval.Rego](#minder-v1-RuleType-Definition-Eval-Rego) | optional | rego is only used if the `rego` type is selected. | | vulncheck | [RuleType.Definition.Eval.Vulncheck](#minder-v1-RuleType-Definition-Eval-Vulncheck) | optional | vulncheck is only used if the `vulncheck` type is selected. | | trusty | [RuleType.Definition.Eval.Trusty](#minder-v1-RuleType-Definition-Eval-Trusty) | optional | trusty is only used if the `trusty` type is selected. | +| homoglyphs | [RuleType.Definition.Eval.Homoglyphs](#minder-v1-RuleType-Definition-Eval-Homoglyphs) | optional | homoglyphs is only used if the `homoglyphs` type is selected. 
| + + + + +#### RuleType.Definition.Eval.Homoglyphs + + + +| Field | Type | Label | Description | +| ----- | ---- | ----- | ----------- | +| type | [string](#string) | | | diff --git a/internal/engine/eval/eval.go b/internal/engine/eval/eval.go index 36b1b980da..e8bd5cb6ce 100644 --- a/internal/engine/eval/eval.go +++ b/internal/engine/eval/eval.go @@ -21,6 +21,7 @@ import ( "fmt" "os" + "github.com/stacklok/minder/internal/engine/eval/homoglyphs/application" "github.com/stacklok/minder/internal/engine/eval/jq" "github.com/stacklok/minder/internal/engine/eval/rego" "github.com/stacklok/minder/internal/engine/eval/trusty" @@ -58,6 +59,8 @@ func NewRuleEvaluator(rt *pb.RuleType, cli *providers.ProviderBuilder) (engif.Ev trustyEvalConfig.Endpoint = os.Getenv("MINDER_UNSTABLE_TRUSTY_ENDPOINT") } return trusty.NewTrustyEvaluator(trustyEvalConfig, cli) + case application.HomoglyphsEvalType: + return application.NewHomoglyphsEvaluator(e.GetHomoglyphs(), cli) default: return nil, fmt.Errorf("unsupported rule type engine: %s", rt.Def.Eval.Type) } diff --git a/internal/engine/eval/homoglyphs/application/homoglyphs_service.go b/internal/engine/eval/homoglyphs/application/homoglyphs_service.go new file mode 100644 index 0000000000..46d4bd3c60 --- /dev/null +++ b/internal/engine/eval/homoglyphs/application/homoglyphs_service.go @@ -0,0 +1,48 @@ +// Copyright 2023 Stacklok, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package application + +import ( + "fmt" + + "github.com/stacklok/minder/internal/engine/interfaces" + "github.com/stacklok/minder/internal/providers" + pb "github.com/stacklok/minder/pkg/api/protobuf/go/minder/v1" +) + +const ( + HomoglyphsEvalType = "homoglyphs" + + invisibleCharacters = "invisible_characters" + mixedScript = "mixed_scripts" +) + +func NewHomoglyphsEvaluator(reh *pb.RuleType_Definition_Eval_Homoglyphs, pbuild *providers.ProviderBuilder) (interfaces.Evaluator, error) { + if pbuild == nil { + return nil, fmt.Errorf("provider builder is nil") + } + if reh == nil { + return nil, fmt.Errorf("homoglyphs configuration is nil") + } + + switch reh.Type { + case invisibleCharacters: + return NewInvisibleCharactersEvaluator(pbuild) + case mixedScript: + return NewMixedScriptEvaluator(pbuild) + default: + return nil, fmt.Errorf("unsupported homoglyphs type: %s", reh.Type) + } +} diff --git a/internal/engine/eval/homoglyphs/application/invisible_characters_eval.go b/internal/engine/eval/homoglyphs/application/invisible_characters_eval.go new file mode 100644 index 0000000000..0a1adaafee --- /dev/null +++ b/internal/engine/eval/homoglyphs/application/invisible_characters_eval.go @@ -0,0 +1,109 @@ +// Copyright 2023 Stacklok, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package application + +import ( + "context" + "fmt" + "strings" + + "github.com/google/go-github/v56/github" + + "github.com/stacklok/minder/internal/engine/eval/homoglyphs/communication" + "github.com/stacklok/minder/internal/engine/eval/homoglyphs/domain" + "github.com/stacklok/minder/internal/engine/eval/homoglyphs/util" + engif "github.com/stacklok/minder/internal/engine/interfaces" + "github.com/stacklok/minder/internal/providers" + pb "github.com/stacklok/minder/pkg/api/protobuf/go/minder/v1" +) + +// InvisibleCharactersEvaluator is an evaluator for the invisible characters rule type +type InvisibleCharactersEvaluator struct { + processor *domain.InvisibleCharactersProcessor + reviewHandler *communication.GhReviewPrHandler +} + +func NewInvisibleCharactersEvaluator(pbuild *providers.ProviderBuilder) (*InvisibleCharactersEvaluator, error) { + if pbuild == nil { + return nil, fmt.Errorf("provider builder is nil") + } + + ghClient, err := pbuild.GetGitHub(context.Background()) + if err != nil { + return nil, fmt.Errorf("could not fetch GitHub client: %w", err) + } + + return &InvisibleCharactersEvaluator{ + processor: domain.NewInvisibleCharactersProcessor(), + reviewHandler: communication.NewGhReviewPrHandler(ghClient), + }, nil +} + +func (ice *InvisibleCharactersEvaluator) Eval(ctx context.Context, _ map[string]any, res *engif.Result) error { + if res == nil { + return fmt.Errorf("result is nil") + } + + //nolint:govet + prContents, ok := res.Object.(pb.PrContents) + if !ok { + return fmt.Errorf("invalid object type for homoglyphs evaluator") + } + + if prContents.Pr == nil || prContents.Files == nil { + return fmt.Errorf("invalid prContents fields: %v, %v", prContents.Pr, prContents.Files) + } + + if len(prContents.Files) == 0 { + return nil + } + + // Note: This is a mandatory step to reassign certain fields in the handler. + // This is a workaround to avoid recreating the object. 
+ ice.reviewHandler.Hydrate(ctx, prContents.Pr) + + for _, file := range prContents.Files { + for _, line := range file.PatchLines { + invisibleCharactersFound := ice.processor.FindInvisibleCharacters(line.Content) + if len(invisibleCharactersFound) == 0 { + continue + } + + var commentBody strings.Builder + commentBody.WriteString("**Invisible Characters Found:**\n\n") + + for _, r := range invisibleCharactersFound { + commentBody.WriteString(fmt.Sprintf("- `%U` \n", r)) + } + + reviewComment := &github.DraftReviewComment{ + Path: github.String(file.Name), + Body: github.String(commentBody.String()), + Line: github.Int(int(line.LineNumber)), + } + + ice.reviewHandler.AddComment(reviewComment) + } + } + + var reviewText string + if len(ice.reviewHandler.GetComments()) > 0 { + reviewText = util.InvisibleCharsFoundText + } else { + reviewText = util.NoInvisibleCharsFoundText + } + + return ice.reviewHandler.SubmitReview(ctx, reviewText) +} diff --git a/internal/engine/eval/homoglyphs/application/mixed_scripts_eval.go b/internal/engine/eval/homoglyphs/application/mixed_scripts_eval.go new file mode 100644 index 0000000000..581ccd90a2 --- /dev/null +++ b/internal/engine/eval/homoglyphs/application/mixed_scripts_eval.go @@ -0,0 +1,114 @@ +// Copyright 2023 Stacklok, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package application + +import ( + "context" + "fmt" + "strings" + + "github.com/google/go-github/v56/github" + + "github.com/stacklok/minder/internal/engine/eval/homoglyphs/communication" + "github.com/stacklok/minder/internal/engine/eval/homoglyphs/domain" + "github.com/stacklok/minder/internal/engine/eval/homoglyphs/util" + engif "github.com/stacklok/minder/internal/engine/interfaces" + "github.com/stacklok/minder/internal/providers" + pb "github.com/stacklok/minder/pkg/api/protobuf/go/minder/v1" +) + +// MixedScriptsEvaluator is the evaluator for the mixed scripts rule type +type MixedScriptsEvaluator struct { + processor *domain.MixedScriptsProcessor + reviewHandler *communication.GhReviewPrHandler +} + +func NewMixedScriptEvaluator(pbuild *providers.ProviderBuilder) (*MixedScriptsEvaluator, error) { + if pbuild == nil { + return nil, fmt.Errorf("provider builder is nil") + } + + ghClient, err := pbuild.GetGitHub(context.Background()) + if err != nil { + return nil, fmt.Errorf("could not fetch GitHub client: %w", err) + } + + msProcessor, err := domain.NewMixedScriptsProcessor() + if err != nil { + return nil, fmt.Errorf("could not create mixed scripts processor: %w", err) + } + + return &MixedScriptsEvaluator{ + processor: msProcessor, + reviewHandler: communication.NewGhReviewPrHandler(ghClient), + }, nil +} + +func (mse *MixedScriptsEvaluator) Eval(ctx context.Context, pol map[string]any, res *engif.Result) error { + if res == nil { + return fmt.Errorf("result is nil") + } + + //nolint:govet + prContents, ok := res.Object.(pb.PrContents) + if !ok { + return fmt.Errorf("invalid object type for homoglyphs evaluator") + } + + if prContents.Pr == nil || prContents.Files == nil { + return fmt.Errorf("invalid prContents fields: %v, %v", prContents.Pr, prContents.Files) + } + + if len(prContents.Files) == 0 { + return nil + } + + // Note: This is a mandatory step to reassign certain fields in the handler. + // This is a workaround to avoid recreating the object. 
+ mse.reviewHandler.Hydrate(ctx, prContents.Pr) + + for _, file := range prContents.Files { + for _, line := range file.PatchLines { + mixedScriptsInfo := mse.processor.FindMixedScripts(line.Content) + if len(mixedScriptsInfo) == 0 { + continue + } + + var commentBody strings.Builder + commentBody.WriteString("**Mixed Scripts Detected:**\n\n") + + for _, info := range mixedScriptsInfo { + commentBody.WriteString(fmt.Sprintf("- Text: `%s`, Scripts: %v\n", info.Text, info.ScriptsFound)) + } + + reviewComment := &github.DraftReviewComment{ + Path: github.String(file.Name), + Body: github.String(commentBody.String()), + Line: github.Int(int(line.LineNumber)), + } + + mse.reviewHandler.AddComment(reviewComment) + } + } + + var reviewText string + if len(mse.reviewHandler.GetComments()) > 0 { + reviewText = util.MixedScriptsFoundText + } else { + reviewText = util.NoMixedScriptsFoundText + } + + return mse.reviewHandler.SubmitReview(ctx, reviewText) +} diff --git a/internal/engine/eval/homoglyphs/communication/reviewer.go b/internal/engine/eval/homoglyphs/communication/reviewer.go new file mode 100644 index 0000000000..76a1a3ce2a --- /dev/null +++ b/internal/engine/eval/homoglyphs/communication/reviewer.go @@ -0,0 +1,163 @@ +// Copyright 2023 Stacklok, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package communication + +import ( + "context" + "fmt" + "strings" + + "github.com/google/go-github/v56/github" + "github.com/rs/zerolog" + + "github.com/stacklok/minder/internal/engine/eval/homoglyphs/util" + pb "github.com/stacklok/minder/pkg/api/protobuf/go/minder/v1" + provifv1 "github.com/stacklok/minder/pkg/providers/v1" +) + +type GhReviewPrHandler struct { + logger zerolog.Logger + + ghClient provifv1.GitHub + pr *pb.PullRequest + + minderReview *github.PullRequestReview + comments []*github.DraftReviewComment +} + +func NewGhReviewPrHandler(ghClient provifv1.GitHub) *GhReviewPrHandler { + return &GhReviewPrHandler{ + ghClient: ghClient, + } +} + +func (ra *GhReviewPrHandler) SubmitReview(ctx context.Context, reviewText string) error { + if err := ra.findPreviousReview(ctx); err != nil { + return fmt.Errorf("could not find previous review: %w", err) + } + + if ra.minderReview != nil { + if ra.minderReview.CommitID != nil && *ra.minderReview.CommitID == ra.pr.CommitSha { + // if the previous review was on the same commit, keep it + ra.logger.Debug(). + Int64("review-id", ra.minderReview.GetID()). + Msg("previous review was on the same commit, will keep it") + return nil + } + + if err := ra.dismissReview(ctx); err != nil { + ra.logger.Error().Err(err). + Int64("review-id", ra.minderReview.GetID()). + Msg("could not dismiss previous review") + } else { + ra.logger.Debug(). + Int64("review-id", ra.minderReview.GetID()). + Msg("dismissed previous review") + } + } + + if err := ra.submitReview(ctx, reviewText); err != nil { + return fmt.Errorf("could not submit review: %w", err) + } + ra.logger.Debug().Msg("submitted review") + return nil +} + +func (ra *GhReviewPrHandler) Hydrate(ctx context.Context, pr *pb.PullRequest) { + logger := zerolog.Ctx(ctx).With(). + Int32("pull-number", pr.Number). + Str("repo-owner", pr.RepoOwner). + Str("repo-name", pr.RepoName).
+ Logger() + + ra.logger = logger + ra.pr = pr + ra.comments = make([]*github.DraftReviewComment, 0) + ra.minderReview = nil +} + +func (ra *GhReviewPrHandler) AddComment(comment *github.DraftReviewComment) { + ra.comments = append(ra.comments, comment) +} + +func (ra *GhReviewPrHandler) GetComments() []*github.DraftReviewComment { + return ra.comments +} + +func (ra *GhReviewPrHandler) findPreviousReview(ctx context.Context) error { + reviews, err := ra.ghClient.ListReviews(ctx, ra.pr.RepoOwner, ra.pr.RepoName, int(ra.pr.Number), nil) + if err != nil { + return fmt.Errorf("could not list reviews: %w", err) + } + + ra.minderReview = nil + for _, r := range reviews { + if strings.HasPrefix(r.GetBody(), util.ReviewBodyMagicComment) && r.GetState() != "DISMISSED" { + ra.minderReview = r + break + } + } + + return nil +} + +func (ra *GhReviewPrHandler) submitReview(ctx context.Context, reviewText string) error { + body, err := util.CreateReviewBody(reviewText) + if err != nil { + return fmt.Errorf("could not create review body: %w", err) + } + + review := &github.PullRequestReviewRequest{ + CommitID: github.String(ra.pr.CommitSha), + Event: github.String("COMMENT"), + Comments: ra.comments, + Body: github.String(body), + } + + _, err = ra.ghClient.CreateReview( + ctx, + ra.pr.RepoOwner, + ra.pr.RepoName, + int(ra.pr.Number), + review, + ) + if err != nil { + return fmt.Errorf("could not create review: %w", err) + } + + return nil +} + +func (ra *GhReviewPrHandler) dismissReview(ctx context.Context) error { + if ra.minderReview == nil { + return nil + } + + dismissReview := &github.PullRequestReviewDismissalRequest{ + Message: github.String(util.ReviewBodyDismissCommentText), + } + + _, err := ra.ghClient.DismissReview( + ctx, + ra.pr.RepoOwner, + ra.pr.RepoName, + int(ra.pr.Number), + ra.minderReview.GetID(), + dismissReview) + if err != nil { + return fmt.Errorf("could not dismiss review: %w", err) + } + return nil +} diff --git 
a/internal/engine/eval/homoglyphs/domain/invisible_characters.go b/internal/engine/eval/homoglyphs/domain/invisible_characters.go new file mode 100644 index 0000000000..04eff64931 --- /dev/null +++ b/internal/engine/eval/homoglyphs/domain/invisible_characters.go @@ -0,0 +1,45 @@ +// Copyright 2023 Stacklok, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package domain contains the domain logic for the homoglyphs rule type +package domain + +import ( + "github.com/stacklok/minder/internal/engine/eval/homoglyphs/domain/resources" +) + +// InvisibleCharactersProcessor is a processor for the invisible characters rule type +type InvisibleCharactersProcessor struct { + invisibleCharacters map[rune]struct{} +} + +// NewInvisibleCharactersProcessor creates a new InvisibleCharactersProcessor +func NewInvisibleCharactersProcessor() *InvisibleCharactersProcessor { + return &InvisibleCharactersProcessor{ + invisibleCharacters: resources.InvisibleCharacters, + } +} + +// FindInvisibleCharacters checks for invisible characters in the given line +// and returns a slice of runes representing the invisible characters found. 
+func (ice *InvisibleCharactersProcessor) FindInvisibleCharacters(line string) []rune { + invisibleChars := make([]rune, 0) + for _, r := range line { + if _, exists := ice.invisibleCharacters[r]; exists { + invisibleChars = append(invisibleChars, r) + } + } + + return invisibleChars +} diff --git a/internal/engine/eval/homoglyphs/domain/invisible_characters_test.go b/internal/engine/eval/homoglyphs/domain/invisible_characters_test.go new file mode 100644 index 0000000000..ec2bcc3a76 --- /dev/null +++ b/internal/engine/eval/homoglyphs/domain/invisible_characters_test.go @@ -0,0 +1,60 @@ +// Copyright 2023 Stacklok, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package domain + +import ( + "reflect" + "testing" +) + +func TestFindInvisibleCharacters(t *testing.T) { + t.Parallel() + + tests := []struct { + description string + line string + expected []rune + }{ + { + description: "No invisible characters", + line: "Hello, World!", + expected: []rune{}, + }, + { + description: "Contains Zero Width Space", + line: "Hello,\u200BWorld!", + expected: []rune{'\u200B'}, + }, + { + description: "Multiple invisible characters", + line: "Invisible\u200BText\u200C\u200D", + expected: []rune{'\u200B', '\u200C', '\u200D'}, + }, + } + + processor := NewInvisibleCharactersProcessor() + + for _, tt := range tests { + tt := tt + t.Run(tt.description, func(t *testing.T) { + t.Parallel() + + result := processor.FindInvisibleCharacters(tt.line) + if !reflect.DeepEqual(result, tt.expected) { + t.Errorf("FindInvisibleCharacters(%q) = %v, want %v", tt.line, result, tt.expected) + } + }) + } +} diff --git a/internal/engine/eval/homoglyphs/domain/mixed_scripts.go b/internal/engine/eval/homoglyphs/domain/mixed_scripts.go new file mode 100644 index 0000000000..4b15764cc7 --- /dev/null +++ b/internal/engine/eval/homoglyphs/domain/mixed_scripts.go @@ -0,0 +1,152 @@ +// Copyright 2023 Stacklok, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package domain + +import ( + "bufio" + "embed" + "fmt" + "io/fs" + "strings" +) + +//go:embed resources/scripts.txt +var scriptsContent embed.FS + +// MixedScriptsProcessor is a processor for the mixed scripts rule type +type MixedScriptsProcessor struct { + runeToScript map[rune]string +} + +// NewMixedScriptsProcessor creates a new MixedScriptsProcessor +func NewMixedScriptsProcessor() (*MixedScriptsProcessor, error) { + // 7th of Feb, 2024: https://www.unicode.org/Public/UCD/latest/ucd/Scripts.txt + runeToScript, err := loadScriptData("resources/scripts.txt") + if err != nil { + return nil, err + } + + return &MixedScriptsProcessor{ + runeToScript: runeToScript, + }, nil +} + +// MixedScriptInfo contains information about a word that mixes multiple scripts +type MixedScriptInfo struct { + Text string + ScriptsFound []string +} + +// FindMixedScripts returns one MixedScriptInfo for each whitespace-separated word in line +// whose runes belong to more than one script, which may indicate obfuscated text. Runes of +// the Common script, or absent from the script table, are ignored, so a word such as "B." +// (Latin plus Common) is not flagged. ScriptsFound is collected from a map, so the order of +// its entries is not deterministic.
+func (mse *MixedScriptsProcessor) FindMixedScripts(line string) []MixedScriptInfo { + words := strings.Fields(line) + mixedScripts := make([]MixedScriptInfo, 0) + + for _, word := range words { + scriptsFound := make(map[string]struct{}) + for _, r := range word { + script, exists := mse.runeToScript[r] + if !exists || script == "Common" { + continue + } + scriptsFound[script] = struct{}{} + } + + if len(scriptsFound) > 1 { + scripts := make([]string, 0, len(scriptsFound)) + for script := range scriptsFound { + scripts = append(scripts, script) + } + mixedScripts = append(mixedScripts, MixedScriptInfo{ + Text: word, + ScriptsFound: scripts, + }) + } + } + + return mixedScripts +} + +// loadScriptData reads data from the specified file in Scripts.txt format and populates a runeToScript map. +// The function parses each line of the file, ignoring comments and empty lines. +// It expects lines in the format ";