diff --git a/cmd/slackdump/internal/man/assets/changelog.md b/cmd/slackdump/internal/man/assets/changelog.md index 146ce925..ebc17be1 100644 --- a/cmd/slackdump/internal/man/assets/changelog.md +++ b/cmd/slackdump/internal/man/assets/changelog.md @@ -21,6 +21,10 @@ `workspace select` command; - The "_Current_" workspace can be overridden by providing the `-w ` flag. +- Slackdump `record` mode allows dumping the entire workspace into a directory + of chunk files. +- Slackdump `convert` mode allows converting chunk files into other formats, + such as Slack export format, or Slackdump format. ## Changes diff --git a/cmd/slackdump/internal/man/assets/chunk.md b/cmd/slackdump/internal/man/assets/chunk.md new file mode 100644 index 00000000..8d6b72ed --- /dev/null +++ b/cmd/slackdump/internal/man/assets/chunk.md @@ -0,0 +1,150 @@ +# Slackdump Chunk File Format + +## Introduction + +Chunk file format is a gzip-compressed JSONL file with each line being a JSON +object. + +The benefit of chunk file format is that it can be converted to other formats, +such as Slack export format, or Slackdump format. Chunk file format is used +internally by Slackdump during processing of the API output; it allows for +concurrent processing, minimising the memory usage during the transformation +phase. 
+ + +## Chunk file format specification + +The structure of the chunk file is best represented by the following code +snippet: + +```go +type Chunk struct { + Type ChunkType `json:"t"` + Timestamp int64 `json:"ts"` + ChannelID string `json:"id,omitempty"` + Count int `json:"n,omitempty"` + ThreadTS string `json:"r,omitempty"` + IsLast bool `json:"l,omitempty"` + NumThreads int `json:"nt,omitempty"` + Channel *slack.Channel `json:"ci,omitempty"` + Parent *slack.Message `json:"p,omitempty"` + Messages []slack.Message `json:"m,omitempty"` + Files []slack.File `json:"f,omitempty"` + Users []slack.User `json:"u,omitempty"` + Channels []slack.Channel `json:"ch,omitempty"` + WorkspaceInfo *slack.AuthTestResponse `json:"w,omitempty"` +} +``` + +Sample chunk JSON message: + +```json +{ + "t": 5, + "ts": 1683022288506765000, + "id": "CHYLGDP0D", + "ci": { + "id": "CHYLGDP0D", + "created": 1555493778, + "is_open": false, + "last_read": "1682743815.053209", + "name_normalized": "random", + "name": "random", + //... + } +} +``` + +## Fields + +### t: Chunk type + +Each JSON object contains one of the following "chunks" of information; the +chunk type is denoted as an unsigned 8-bit integer: +- **Type 0**: slice of channel messages; +- **Type 1**: slice of channel message replies (a thread); +- **Type 2**: slice of files that were uploaded to the workspace (only definitions); +- **Type 3**: slice of channels; +- **Type 4**: slice of users; +- **Type 5**: workspace information. 
+ +Each chunk type is a direct mapping to the Slack API method that was used to +retrieve the data: + +- **Type 0**: [conversations.history](https://api.slack.com/methods/conversations.history); +- **Type 1**: [conversations.replies](https://api.slack.com/methods/conversations.replies); +- **Type 2**: [files.list](https://api.slack.com/methods/files.list); +- **Type 3**: [conversations.list](https://api.slack.com/methods/conversations.list); +- **Type 4**: [users.list](https://api.slack.com/methods/users.list); +- **Type 5**: [auth.test](https://api.slack.com/methods/auth.test). + +Chunk type values are guaranteed to remain unchanged in the future. If a new +chunk type is added, it will be added as a new value, and the existing +values will not be changed. + +### ts: Timestamp + +The timestamp is a Unix timestamp in nanoseconds. It contains the time +at which the chunk was recorded. + +### id: Channel ID + +The channel ID is a string that contains the ID of the channel that the chunk +belongs to. It is only populated for chunks of type 0, 1, and 2. + +### n: Number of messages or files +The number of messages or files is an integer that contains the number of +messages or files that are contained in the chunk. It is only populated for +chunks of type 0, 1, and 2. + +### r: Thread timestamp + +The thread timestamp is a string that contains the timestamp of the thread +that the chunk belongs to. It is only populated for chunks of type 1. + +### l: Is last chunk + +The "is last chunk" flag is a boolean that is set to true if the chunk is the last +chunk for the channel or thread. It is only populated for chunks of type 0 +and 1. + +### nt: Number of threads + +The number of threads is an integer that contains the number of threads that +are contained in the chunk. It is only populated for chunks of type 0. + +### ci: Channel information + +The channel information field contains the channel information as returned by the +API. It is only populated for chunks of type 0, 1, and 2. 
+ +### p: Parent message + +The parent message field contains the parent message for a thread or a file chunk. +It is only populated for chunks of type 1 and 2. + +### m: Messages + +The messages field contains a chunk of messages as returned by the API. It is only +populated for chunks of type 0 and 1. The slice size can range from 1 +to 1000 for message type chunks. + +### f: Files + +The files field contains a chunk of files as returned by the API. It is only +populated for chunks of type 2. + +### u: Users + +The users field contains a chunk of users as returned by the API. It is only +populated for chunks of type 4. + +### ch: Channels + +The channels field contains a chunk of channels as returned by the API. It is only +populated for chunks of type 3. + +### w: Workspace information + +The workspace information field contains the workspace information. It is only +populated for chunks of type 5. diff --git a/cmd/slackdump/internal/man/chunk.go b/cmd/slackdump/internal/man/chunk.go new file mode 100644 index 00000000..47ac231a --- /dev/null +++ b/cmd/slackdump/internal/man/chunk.go @@ -0,0 +1,16 @@ +package man + +import ( + _ "embed" + + "github.com/rusq/slackdump/v2/cmd/slackdump/internal/golang/base" +) + +//go:embed assets/chunk.md +var mdChunk string + +var Chunk = &base.Command{ + UsageLine: "slackdump chunk", + Short: "chunk file format specification", + Long: mdChunk, +} diff --git a/cmd/slackdump/internal/record/assets/record.md b/cmd/slackdump/internal/record/assets/record.md new file mode 100644 index 00000000..edcb73b1 --- /dev/null +++ b/cmd/slackdump/internal/record/assets/record.md @@ -0,0 +1,27 @@ +# Command Record + +The record command performs a complete dump of the workspace. The dump is in +the "Chunk" file format. + +## What does Record dump? 
+ +Record behaves similarly to the Slack export feature; the output of a +successful run contains the following: +- channels.json.gz - list of channels in the workspace; +- users.json.gz - list of users in the workspace; +- CXXXXXXX.json.gz - channel or group conversation messages, where XXXXXXX is + the channel ID; +- DXXXXXXX.json.gz - direct messages, where XXXXXXX is the user ID. + +Please note that these are not traditional JSON files, but rather JSONL files, +where each line is a JSON object. This is done to minimise memory usage +during processing. + +Another difference from the Slack export is that the output is not a single +archive, but rather a directory with files. Slackdump does not support writing +chunk files into a ZIP file, and strictly speaking, it is not necessary, as +chunk files are already compressed. + +## Chunk file format + +See `slackdump help chunk` for the format specification. diff --git a/cmd/slackdump/internal/record/record.go b/cmd/slackdump/internal/record/record.go new file mode 100644 index 00000000..3fd13120 --- /dev/null +++ b/cmd/slackdump/internal/record/record.go @@ -0,0 +1,33 @@ +package record + +import ( + "context" + _ "embed" + + "github.com/rusq/slackdump/v2/auth" + "github.com/rusq/slackdump/v2/cmd/slackdump/internal/cfg" + "github.com/rusq/slackdump/v2/cmd/slackdump/internal/golang/base" +) + +//go:embed assets/record.md +var mdRecord string + +var CmdRecord = &base.Command{ + Run: RunRecord, + UsageLine: "slackdump record [link1[ link 2[ link N]]]", + Short: "record the dump of the workspace or individual conversations", + Long: mdRecord, + FlagMask: cfg.OmitUserCacheFlag | cfg.OmitCacheDir, + RequireAuth: true, + PrintFlags: true, +} + +func RunRecord(ctx context.Context, cmd *base.Command, args []string) error { + prov, err := auth.FromContext(ctx) + if err != nil { + base.SetExitStatus(base.SAuthError) + return err + } + _ = prov + return nil +} diff --git a/cmd/slackdump/main.go b/cmd/slackdump/main.go index 
0c926dad..1539156a 100644 --- a/cmd/slackdump/main.go +++ b/cmd/slackdump/main.go @@ -29,6 +29,7 @@ import ( "github.com/rusq/slackdump/v2/cmd/slackdump/internal/golang/help" "github.com/rusq/slackdump/v2/cmd/slackdump/internal/list" "github.com/rusq/slackdump/v2/cmd/slackdump/internal/man" + "github.com/rusq/slackdump/v2/cmd/slackdump/internal/record" v1 "github.com/rusq/slackdump/v2/cmd/slackdump/internal/v1" "github.com/rusq/slackdump/v2/cmd/slackdump/internal/wizard" "github.com/rusq/slackdump/v2/cmd/slackdump/internal/workspace" @@ -48,6 +49,7 @@ func init() { wizard.CmdWizard, export.CmdExport, dump.CmdDump, + record.CmdRecord, list.CmdList, emoji.CmdEmoji, workspace.CmdWorkspace, @@ -57,8 +59,9 @@ func init() { convert.CmdConvert, CmdVersion, - man.Login, man.WhatsNew, + man.Login, + man.Chunk, } } diff --git a/internal/structures/entity_list.go b/internal/structures/entity_list.go index b9c89039..81c11a25 100644 --- a/internal/structures/entity_list.go +++ b/internal/structures/entity_list.go @@ -181,6 +181,7 @@ func buildEntryIndex(links []string) (map[string]bool, error) { } files = append(files, trimmed) default: + // no prefix sl, err := ParseLink(ent) if err != nil { return nil, err