Skip to content

Commit

Permalink
Implemented an 'ignore line' content filter
Browse files Browse the repository at this point in the history
  • Loading branch information
crashdump committed Jul 27, 2023
1 parent 049030d commit f5db4d8
Show file tree
Hide file tree
Showing 11 changed files with 245 additions and 68 deletions.
9 changes: 6 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,18 @@
.DEFAULT_GOAL=test

build: test
go build -o bin/edmgen/edmgen cmd/edmgen/*.go
go build -v -o bin/edmgen/edmgen cmd/edmgen/*.go

# The build target writes its binary to bin/edmgen/edmgen, so bin/edmgen is a
# directory and has to be removed recursively.
clean:
	rm -rf bin/edmgen

test:
go mod tidy
staticcheck ./...
go test ./...
go test -v ./...

integration-test:
go test ./... -tags=integration
go test -v ./... -tags=integration

release: test integration-test build
go test all -v
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ While a very useful technology, most DLP solutions lack the tools to sample the
## Install

```bash
go install github.com/crashdump/edmgen
go install github.com/crashdump/edmgen/cmd/edmgen@latest
```

## Use
Expand Down Expand Up @@ -61,6 +61,7 @@ Can be applied to the phases: `ExamineFiles` and `SampleContent`

* `LineLength`: Only select lines based on their length. Min and Max can be specified.
* `LongestLine`: Only select the longest line in the file.
* `IgnoreLine`: Ignore any line containing a specified string.
* `Uniq`: Deduplicate content. Especially useful during the final `SampleContent` phase.

Note: Each filter is implemented as its own self-contained function, so filters are easy to extend. Implementing your own filter should not require any changes to the core code.
Expand Down
165 changes: 110 additions & 55 deletions cmd/edmgen/cmd.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
package main

import (
"errors"
"fmt"
"os"
"time"

"github.com/urfave/cli/v2"
"golang.org/x/exp/slices"

"github.com/crashdump/edmgen/pkg/edm"
"github.com/crashdump/edmgen/pkg/filters/content"
Expand All @@ -14,8 +16,17 @@ import (

var logger *logging

const STDOUT = "stdout"

// init replaces the cli package's VersionFlag and HelpFlag variables with
// custom boolean flags: "version" (alias "v") and "help" (no aliases).
func init() {
	cli.VersionFlag = &cli.BoolFlag{
		Name:    "version",
		Aliases: []string{"v"},
	}
	cli.HelpFlag = &cli.BoolFlag{
		Name: "help",
	}
}

func main() {
logger = newLogger()
var flagOutput string
var flagFormat string

logger.print("┌────────┐")
logger.print("│ EDMGEN │")
Expand All @@ -24,83 +35,127 @@ func main() {

app := &cli.App{
Name: "edmgen",
Usage: "Walk files in a directory and extract samples for the purporse of Exact Data Match DLPs",
Usage: "Walk files in a directory and extract (samples) lines",
Compiled: time.Now(),
Authors: []*cli.Author{{
Name: "Adrien Pujol",
Email: "[email protected]",
}},
Flags: []cli.Flag{
&cli.StringSliceFlag{Name: "extensions", Aliases: []string{"E"}},
&cli.StringFlag{
Name: "output",
Aliases: []string{"o"},
Destination: &flagOutput,
Action: func(ctx *cli.Context, v string) error {
if v == "" {
return errors.New("please specify a filename")
}
return nil
},
},
&cli.StringFlag{
Name: "format",
Aliases: []string{"f"},
DefaultText: STDOUT,
Destination: &flagFormat,
Action: func(ctx *cli.Context, v string) error {
	// Accept only the known output formats. The check must fail when the
	// value is NOT in the list; v is the value handed to this action.
	formats := []string{STDOUT, "csv", "txt"}
	if !slices.Contains(formats, v) {
		return fmt.Errorf("output %s not currently supported", v)
	}
	return nil
},
},
},
Before: func(cCtx *cli.Context) error {
if cCtx.Args().Len() < 1 {
logger.printFatal("You need to specify a path to a folder")
logger.print("You need to specify a path to a folder")
os.Exit(1)
}
return nil
},
Action: run,
}
Action: func(cCtx *cli.Context) error {
edmc, err := edm.New(edm.Opts{})
if err != nil {
logger.printFatal(err.Error())
}

if err := app.Run(os.Args); err != nil {
logger.printFatal(err.Error())
}
}
path := cCtx.Args().First()

func run(cCtx *cli.Context) error {
edmc, err := edm.New(edm.Opts{})
if err != nil {
logger.printFatal(err.Error())
}
/*
* Search for all the relevant files
*/
logger.printHeader("Searching for relevant files...")
err = edmc.SelectFiles(path,
file.IgnoreFilename(ignoreFilenames),
file.IgnoreDirname(ignoreDirnames),
file.RequireExtensions(requireExtentions),
)
if err != nil {
logger.printFatal(err.Error())
}
logger.printfResult("Found %d files.", len(edmc.Paths))
logger.print("")

path := cCtx.Args().First()

/*
* Search for all the relevant files
*/
logger.printHeader("Searching for relevant files...")
err = edmc.SelectFiles(path,
file.IgnoreFilename(ignoreFilenames),
file.IgnoreDirname(ignoreDirnames),
file.RequireExtensions(requireExtentions),
)
if err != nil {
logger.printFatal(err.Error())
/*
* Walk through each of the files and sample lines
*/
logger.printHeader("Examining files...")
err = edmc.ExamineFiles(
content.IgnoreLine("serialVersionUID"),
content.LineLength(60, 120, true),
content.LongestLine,
)
if err != nil {
logger.printFatal(err.Error())
}
logger.printfResult("Selected %d lines.", len(edmc.Content))
logger.print("")

/*
* Sample the lines previously selected down to the result
*/
logger.printHeader("Sampling content...")
lines := edmc.SampleContent(
content.Uniq,
)
logger.printfResult("Sampled down to %d lines", len(lines))
logger.print("")

/*
* Finally, output the result
*/
switch flagFormat {
case "txt":
writeFileTxt(flagOutput, lines)

case "csv":
writeFileCsv(flagOutput, lines)

default:
writeStdout(lines)
}

logger.print("Complete!")
return nil
},
}
logger.printfResult("Found %d files.", len(edmc.Paths))
logger.print("")

/*
* Walk through each of the files and sample lines
*/
logger.printHeader("Examining files...")
err = edmc.ExamineFiles(
content.LineLength(40, 100),
content.LongestLine,
)
if err != nil {
if err := app.Run(os.Args); err != nil {
logger.printFatal(err.Error())
}
logger.printfResult("Selected %d lines.", len(edmc.Content))
logger.print("")

/*
* Sample the lines previously selected down to the result
*/
logger.printHeader("Sampling content...")
lines := edmc.SampleContent(
content.Uniq,
)
logger.printfResult("Sampled down to %d lines", len(edmc.Content))
logger.print("")
}

/*
* Finally, output the result
*/
// writeStdout prints each entry of lines to standard output, one entry per
// line, in slice order.
func writeStdout(lines []string) {
	for i := range lines {
		fmt.Println(lines[i])
	}
}

// writeFileTxt writes lines to the file named filename as plain text, one
// entry per line, each terminated by a newline. The file is created if it
// does not exist and truncated if it does.
//
// The function has no error return, so a failed write panics with a
// descriptive message instead of being silently dropped.
func writeFileTxt(filename string, lines []string) {
	var buf []byte
	for _, line := range lines {
		buf = append(buf, line...)
		buf = append(buf, '\n')
	}
	if err := os.WriteFile(filename, buf, 0o644); err != nil {
		panic(fmt.Sprintf("writing %q: %v", filename, err))
	}
}

func writeFileCsv(filename string, lines []string) {

logger.print("Complete!")
return nil
}
1 change: 1 addition & 0 deletions cmd/edmgen/ignore.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ var ignoreFilenames = []string{
}
var ignoreDirnames = []string{
"node_modules",
"documentation",
}

var requireExtentions = []string{
Expand Down
1 change: 0 additions & 1 deletion cmd/edmgen/logging.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,5 @@ func (l logging) printWarn(str string) {
}

func (l logging) printFatal(str string) {
l.logger.Print(fmt.Sprintf("[!!] %s", str))
panic(str)
}
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ go 1.20
require (
github.com/stretchr/testify v1.8.4
github.com/urfave/cli/v2 v2.25.7
golang.org/x/exp v0.0.0-20230725093048-515e97ebf090
)

require (
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ github.com/urfave/cli/v2 v2.25.7 h1:VAzn5oq403l5pHjc4OhD54+XGO9cdKVL/7lDjF+iKUs=
github.com/urfave/cli/v2 v2.25.7/go.mod h1:8qnjx1vcq5s2/wpsqoZFndg2CE5tNFyrTvS6SinrnYQ=
github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 h1:bAn7/zixMGCfxrRTfdpNzjtPYqr8smhKouy9mxVdGPU=
github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673/go.mod h1:N3UwUGtsrSj3ccvlPHLoLsHnpR27oXr4ZE984MbSER8=
golang.org/x/exp v0.0.0-20230725093048-515e97ebf090 h1:Di6/M8l0O2lCLc6VVRWhgCiApHV8MnQurBnFSHsQtNY=
golang.org/x/exp v0.0.0-20230725093048-515e97ebf090/go.mod h1:FXUEEKJgO7OQYeo8N01OfiKP8RXMtf6e8aTskBGqWdc=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
Expand Down
15 changes: 15 additions & 0 deletions pkg/filters/content/ignore.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package content

import "strings"

// IgnoreLine returns a content filter that drops every line containing str
// and keeps the remaining lines in their original order.
//
// Every string contains the empty string, so a filter built with str == ""
// drops all lines. When no line is kept the returned slice is nil.
func IgnoreLine(str string) func(content []string) (result []string) {
	return func(content []string) (result []string) {
		for i := range content {
			if !strings.Contains(content[i], str) {
				result = append(result, content[i])
			}
		}
		return result
	}
}
64 changes: 64 additions & 0 deletions pkg/filters/content/ignore_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
package content_test

import (
"testing"

"github.com/stretchr/testify/assert"

"github.com/crashdump/edmgen/pkg/filters/content"
)

// Test_IgnoreLine runs content.IgnoreLine against a table of inputs and
// compares the filtered output with the expected lines.
func Test_IgnoreLine(t *testing.T) {
	type testCase struct {
		name    string
		ignore  string
		content []string
		want    []string
	}

	cases := []testCase{
		{
			// An empty needle is contained in every line, so nothing is kept.
			name:   "empty-string",
			ignore: "",
			content: []string{
				"Donec a dui et dui fringilla consectetur id nec massa.",
				" Nulla aliquet porttitor venenatis.",
				"Nam tristique maximus ante hendrerit aliquet.",
				"Integer dignissim posuere lobortis.",
				" Aenean ultrices erat ut augue ultrices",
				"Suspendisse lacinia ante nunc, pulvinar blandit nisl ornare ut.",
			},
			want: nil,
		},
		{
			// Only the line containing "maximus" is removed.
			name:   "ignore-maximus",
			ignore: "maximus",
			content: []string{
				"Donec a dui et dui fringilla consectetur id nec massa.",
				"Nulla aliquet porttitor venenatis.",
				"Nam tristique maximus ante hendrerit aliquet.",
				"Integer dignissim posuere lobortis.",
				" Aenean ultrices erat ut augue ultrices",
				"Suspendisse lacinia ante nunc, pulvinar blandit nisl ornare ut.",
			},
			want: []string{
				"Donec a dui et dui fringilla consectetur id nec massa.",
				"Nulla aliquet porttitor venenatis.",
				"Integer dignissim posuere lobortis.",
				" Aenean ultrices erat ut augue ultrices",
				"Suspendisse lacinia ante nunc, pulvinar blandit nisl ornare ut.",
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			filter := content.IgnoreLine(tc.ignore)
			assert.EqualValues(t, tc.want, filter(tc.content))
		})
	}
}
10 changes: 8 additions & 2 deletions pkg/filters/content/length.go
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
package content

func LineLength(min int, max int) func(content []string) (result []string) {
import "strings"

func LineLength(min int, max int, ignoreWhitespaces bool) func(content []string) (result []string) {
return func(content []string) (result []string) {
for _, line := range content {
if len(line) >= min && len(line) <= max {
length := len(line)
if ignoreWhitespaces {
length = len(strings.ReplaceAll(line, " ", ""))
}
if length >= min && length <= max {
result = append(result, line)
}
}
Expand Down
Loading

0 comments on commit f5db4d8

Please sign in to comment.