notebook improvements (#84)

Incorporated the feedback into the cyber foundation model directories and the notebook.

Authors:
  - https://github.com/gbatmaz

Approvers:
  - https://github.com/hsin-c

URL: #84
nv-morpheus · Jul 16, 2024 · ec11622 · ec11622
1 parent bdfca36
commit ec11622
Show file tree

Hide file tree

Showing 14 changed files with 2,062 additions and 36 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -5,4 +5,6 @@ phishing-url-detection/datasets/pipeline/data/** filter=lfs diff=lfs merge=lfs -
 phishing-url-detection/datasets/pipeline/models/** filter=lfs diff=lfs merge=lfs -text
 appshield-dga-detection/datasets/pipeline/models/** filter=lfs diff=lfs merge=lfs -text
 dga-detection/datasets/** filter=lfs diff=lfs merge=lfs -text
-dga-detection/models/** filter=lfs diff=lfs merge=lfs -text
+dga-detection/models/** filter=lfs diff=lfs merge=lfs -text
+cyber-foundation/dataset/prefixed_azure_ad_logs.txt filter=lfs diff=lfs merge=lfs -text
+cyber-foundation/model/cyber-foundation-model-prefix.pt filter=lfs diff=lfs merge=lfs -text
diff --git a/cyber-foundation/dataset/prefixed_azure_ad_logs.txt b/cyber-foundation/dataset/prefixed_azure_ad_logs.txt
diff --git a/cyber-foundation/model/cyber-foundation-model-prefix.pt b/cyber-foundation/model/cyber-foundation-model-prefix.pt
diff --git a/cyber-foundation/model/meta-prefix.pkl b/cyber-foundation/model/meta-prefix.pkl
diff --git a/cyber-foundation/model/train-prefix.bin b/cyber-foundation/model/train-prefix.bin
diff --git a/cyber-foundation/model/val-prefix.bin b/cyber-foundation/model/val-prefix.bin
diff --git a/cyber-foundation/training-inference/Browser_generated.png b/cyber-foundation/training-inference/Browser_generated.png
diff --git a/cyber-foundation/training-inference/Browser_training.png b/cyber-foundation/training-inference/Browser_training.png
diff --git a/cyber-foundation/training-inference/Rouge1.png b/cyber-foundation/training-inference/Rouge1.png
diff --git a/cyber-foundation/training-inference/cosine_similarity.png b/cyber-foundation/training-inference/cosine_similarity.png
diff --git a/cyber-foundation/training-inference/cyber-foundation.ipynb b/cyber-foundation/training-inference/cyber-foundation.ipynb
diff --git a/cyber-foundation/training-inference/prepare-prefix.py b/cyber-foundation/training-inference/prepare-prefix.py
@@ -0,0 +1,96 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# The base code is taken from https://github.com/karpathy/nanoGPT
+
+# The license on the original repository is below
+
+# MIT License
+
+# Copyright (c) 2022 Andrej Karpathy
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import json
+import os
+import pickle
+
+import numpy as np
+
+with open('../dataset/prefixed_azure_ad_logs.txt', 'r') as file:
+    data = file.read()
+
+# get all the unique characters that occur in this text
+chars = sorted(list(set(data)))
+vocab_size = len(chars)
+print("all the unique characters:", ''.join(chars))
+print(f"vocab size: {vocab_size:,}")
+
+# create a mapping from characters to integers
+stoi = {ch: i for i, ch in enumerate(chars)}
+itos = {i: ch for i, ch in enumerate(chars)}
+
+
+def encode(s):
+    # encoder: take a string, output a list of integers
+    return [stoi[c] for c in s]
+
+
+def decode(sequence):
+    # decoder: take a list of integers, output a string
+    return ''.join([itos[i] for i in sequence])
+
+
+# create the train and test splits
+n = len(data)
+train_data = data[:int(n * 0.9)]
+val_data = data[int(n * 0.9):]
+
+# encode both to integers
+train_ids = encode(train_data)
+val_ids = encode(val_data)
+print(f"train has {len(train_ids):,} tokens")
+print(f"val has {len(val_ids):,} tokens")
+
+# export to bin files
+train_ids = np.array(train_ids, dtype=np.uint16)
+val_ids = np.array(val_ids, dtype=np.uint16)
+train_ids.tofile(os.path.join(os.path.dirname(__file__), '../model/train-prefix.bin'))
+val_ids.tofile(os.path.join(os.path.dirname(__file__), '../model/val-prefix.bin'))
+
+# save the meta information as well, to help us encode/decode later
+meta = {
+    'vocab_size': vocab_size,
+    'itos': itos,
+    'stoi': stoi,
+}
+with open(os.path.join(os.path.dirname(__file__), '../model/meta-prefix.pkl'), 'wb') as f:
+    pickle.dump(meta, f)
diff --git a/cyber-foundation/training-inference/sample-username.py b/cyber-foundation/training-inference/sample-username.py
@@ -0,0 +1,119 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# The base code is taken from https://github.com/karpathy/nanoGPT
+
+# The license on the original repository is below
+
+# MIT License
+
+# Copyright (c) 2022 Andrej Karpathy
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import os
+import pickle
+import sys
+from contextlib import nullcontext
+
+import torch
+from model import GPT
+from model import GPTConfig
+
+init_from = 'resume'  # either 'resume' (from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl')
+out_dir = '../model/'  # ignored if init_from is not 'resume'
+start = sys.argv[1]  # or "" or etc. Can also specify a file, use as: "FILE:prompt.txt"
+num_samples = 10  # number of samples to draw
+max_new_tokens = 5000  # number of tokens generated in each sample
+temperature = 0.8  # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
+top_k = 200  # retain only the top_k most likely tokens, clamp others to have 0 probability
+seed = 1337
+device = 'cuda'  # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
+dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported(
+) else 'float16'  # 'float32' or 'bfloat16' or 'float16'
+compile = False  # use PyTorch 2.0 to compile the model to be faster
+
+torch.manual_seed(seed)
+torch.cuda.manual_seed(seed)
+torch.backends.cuda.matmul.allow_tf32 = True  # allow tf32 on matmul
+torch.backends.cudnn.allow_tf32 = True  # allow tf32 on cudnn
+device_type = 'cuda' if 'cuda' in device else 'cpu'  # for later use in torch.autocast
+ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
+ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
+
+# model
+if init_from == 'resume':
+    # init from a model saved in a specific directory
+    ckpt_path = os.path.join(out_dir, '../model/cyber-foundation-model-prefix.pt')
+    checkpoint = torch.load(ckpt_path, map_location=device)
+    gptconf = GPTConfig(**checkpoint['model_args'])
+    model = GPT(gptconf)
+    state_dict = checkpoint['model']
+    unwanted_prefix = '_orig_mod.'
+    for k, v in list(state_dict.items()):
+        if k.startswith(unwanted_prefix):
+            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
+    model.load_state_dict(state_dict)
+elif init_from.startswith('gpt2'):
+    # init from a given GPT-2 model
+    model = GPT.from_pretrained(init_from, dict(dropout=0.0))
+
+model.eval()
+model.to(device)
+if compile:
+    model = torch.compile(model)  # requires PyTorch 2.0 (optional)
+
+with open("../model/meta.pkl", 'rb') as f:
+    meta = pickle.load(f)
+# TODO want to make this more general to arbitrary encoder/decoder schemes
+stoi, itos = meta['stoi'], meta['itos']
+
+
+def encode(s):
+    return [stoi[c] for c in s]
+
+
+def decode(sequence):
+    return ''.join([itos[i] for i in sequence])
+
+
+start_ids = encode(start)
+
+x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])
+
+# run generation
+with torch.no_grad():
+    with ctx:
+        for k in range(num_samples):
+            y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
+            decoded_text = decode(y[0].tolist())
+            print(decoded_text)
+            print('---------------')