-
Notifications
You must be signed in to change notification settings - Fork 17
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Incorporated the feedback into the cyber foundation model directories and the notebook Authors: - https://github.com/gbatmaz Approvers: - https://github.com/hsin-c URL: #84
- Loading branch information
Showing
14 changed files
with
2,062 additions
and
36 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Git LFS file not shown
Git LFS file not shown
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1,531 changes: 1,496 additions & 35 deletions
1,531
cyber-foundation/training-inference/cyber-foundation.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
# SPDX-License-Identifier: Apache-2.0 | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
# The base code is taken from https://github.com/karpathy/nanoGPT | ||
|
||
# The license on the original repository is below | ||
|
||
# MIT License | ||
|
||
# Copyright (c) 2022 Andrej Karpathy | ||
|
||
# Permission is hereby granted, free of charge, to any person obtaining a copy | ||
# of this software and associated documentation files (the "Software"), to deal | ||
# in the Software without restriction, including without limitation the rights | ||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
# copies of the Software, and to permit persons to whom the Software is | ||
# furnished to do so, subject to the following conditions: | ||
|
||
# The above copyright notice and this permission notice shall be included in all | ||
# copies or substantial portions of the Software. | ||
|
||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
# SOFTWARE. | ||
|
||
import json | ||
import os | ||
import pickle | ||
|
||
import numpy as np | ||
|
||
# Read the whole training corpus into memory as one string.
# The input path is anchored on this script's directory, exactly like the
# output paths written at the end of the script, so the script behaves the
# same no matter which working directory it is launched from.
input_path = os.path.join(os.path.dirname(__file__), '../dataset/prefixed_azure_ad_logs.txt')
with open(input_path, 'r') as file:
    data = file.read()

# get all the unique characters that occur in this text;
# sorting makes the character -> id assignment deterministic
chars = sorted(set(data))
vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

# create a mapping from characters to integers (stoi) and the inverse (itos)
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}
|
||
|
||
def encode(s):
    """Encoder: turn the string ``s`` into a list of integer ids, one per character.

    Each character is looked up in the module-level ``stoi`` table; a
    character missing from that table raises ``KeyError``.
    """
    ids = []
    for ch in s:
        ids.append(stoi[ch])
    return ids
|
||
|
||
def decode(sequence):
    """Decoder: turn an iterable of integer ids back into a string.

    Each id is looked up in the module-level ``itos`` table; an id missing
    from that table raises ``KeyError``.
    """
    pieces = [itos[token_id] for token_id in sequence]
    return ''.join(pieces)
|
||
|
||
# create the train and test splits: the first 90% of the characters are used
# for training, the remainder for validation
n = len(data)
split_at = int(n * 0.9)
train_data, val_data = data[:split_at], data[split_at:]

# encode both to integers
train_ids, val_ids = encode(train_data), encode(val_data)
print(f"train has {len(train_ids):,} tokens")
print(f"val has {len(val_ids):,} tokens")
|
||
# export to bin files: the ids are stored as raw uint16 values, and the
# target paths are resolved relative to the directory containing this script
script_dir = os.path.dirname(__file__)
train_bin_path = os.path.join(script_dir, '../model/train-prefix.bin')
val_bin_path = os.path.join(script_dir, '../model/val-prefix.bin')

train_ids = np.array(train_ids, dtype=np.uint16)
val_ids = np.array(val_ids, dtype=np.uint16)
train_ids.tofile(train_bin_path)
val_ids.tofile(val_bin_path)
|
||
# save the meta information as well, to help us encode/decode later:
# the vocabulary size plus both lookup tables, pickled next to the bin files
meta = {'vocab_size': vocab_size, 'itos': itos, 'stoi': stoi}
meta_path = os.path.join(os.path.dirname(__file__), '../model/meta-prefix.pkl')
with open(meta_path, 'wb') as f:
    pickle.dump(meta, f)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
# SPDX-License-Identifier: Apache-2.0 | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
# The base code is taken from https://github.com/karpathy/nanoGPT | ||
|
||
# The license on the original repository is below | ||
|
||
# MIT License | ||
|
||
# Copyright (c) 2022 Andrej Karpathy | ||
|
||
# Permission is hereby granted, free of charge, to any person obtaining a copy | ||
# of this software and associated documentation files (the "Software"), to deal | ||
# in the Software without restriction, including without limitation the rights | ||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
# copies of the Software, and to permit persons to whom the Software is | ||
# furnished to do so, subject to the following conditions: | ||
|
||
# The above copyright notice and this permission notice shall be included in all | ||
# copies or substantial portions of the Software. | ||
|
||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
# SOFTWARE. | ||
|
||
import os | ||
import pickle | ||
import sys | ||
from contextlib import nullcontext | ||
|
||
import torch | ||
from model import GPT | ||
from model import GPTConfig | ||
|
||
# -----------------------------------------------------------------------------
# Sampling configuration
init_from = 'resume'  # either 'resume' (from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl')
out_dir = '../model/'  # ignored if init_from is not 'resume'

# The prompt is a required command-line argument. Exit with a usage message
# instead of an unexplained IndexError when it is missing.
if len(sys.argv) < 2:
    sys.exit("usage: python " + os.path.basename(__file__) + " <prompt>")
start = sys.argv[1]  # prompt text every sample is continued from (used verbatim)

num_samples = 10  # number of samples to draw
max_new_tokens = 5000  # number of tokens generated in each sample
temperature = 0.8  # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
top_k = 200  # retain only the top_k most likely tokens, clamp others to have 0 probability
seed = 1337
# Prefer the GPU but fall back to the CPU when CUDA is unavailable, so the
# script still runs on CPU-only machines. Other examples: 'cuda:0', 'cuda:1'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported(
) else 'float16'  # 'float32' or 'bfloat16' or 'float16'
compile = False  # use PyTorch 2.0 to compile the model to be faster (NOTE: shadows the builtin `compile`)
|
||
# seed the default and the CUDA random number generators with the same value
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

# allow tf32 on matmul and on cudnn
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# for later use in torch.autocast
device_type = 'cpu' if 'cuda' not in device else 'cuda'
ptdtype = {
    'float32': torch.float32,
    'bfloat16': torch.bfloat16,
    'float16': torch.float16,
}[dtype]

# autocast is only entered on non-CPU devices; on CPU the context is a no-op
if device_type == 'cpu':
    ctx = nullcontext()
else:
    ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype)
|
||
# model
if init_from == 'resume':
    # init from a checkpoint saved in out_dir. The file name is joined
    # directly onto out_dir so that changing out_dir really relocates the lookup.
    ckpt_path = os.path.join(out_dir, 'cyber-foundation-model-prefix.pt')
    # NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
    checkpoint = torch.load(ckpt_path, map_location=device)
    gptconf = GPTConfig(**checkpoint['model_args'])
    model = GPT(gptconf)
    state_dict = checkpoint['model']
    # Some checkpoints carry an '_orig_mod.' prefix on their parameter names
    # (presumably left by torch.compile - confirm); strip it so the keys
    # match the freshly constructed model.
    unwanted_prefix = '_orig_mod.'
    for k in list(state_dict):  # iterate over a snapshot: the dict is mutated below
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
    model.load_state_dict(state_dict)
elif init_from.startswith('gpt2'):
    # init from a given GPT-2 model
    model = GPT.from_pretrained(init_from, dict(dropout=0.0))
else:
    # fail here with a clear message instead of a NameError on `model` below
    raise ValueError(f"unsupported init_from {init_from!r}: expected 'resume' or a 'gpt2*' variant")

model.eval()
model.to(device)
if compile:
    model = torch.compile(model)  # requires PyTorch 2.0 (optional)
|
||
# Load the pickled metadata dict and pull out the two lookup tables used by
# encode/decode below.
# NOTE(review): this script loads the "-prefix" checkpoint but reads meta.pkl
# rather than a "-prefix" metadata file - confirm both share one vocabulary.
# NOTE: pickle.load executes arbitrary code from the file; only use trusted files.
with open("../model/meta.pkl", 'rb') as meta_file:
    meta = pickle.load(meta_file)
# TODO want to make this more general to arbitrary encoder/decoder schemes
stoi = meta['stoi']
itos = meta['itos']
|
||
|
||
def encode(s):
    """Map each character of ``s`` to its integer id using the ``stoi`` table.

    Raises ``KeyError`` for a character that is not in ``stoi``.
    """
    ids = []
    for ch in s:
        ids.append(stoi[ch])
    return ids
|
||
|
||
def decode(sequence):
    """Map each integer id in ``sequence`` back to its character using ``itos`` and join them.

    Raises ``KeyError`` for an id that is not in ``itos``.
    """
    pieces = [itos[token_id] for token_id in sequence]
    return ''.join(pieces)
|
||
|
||
# encode the prompt, then prepend an axis of size 1 so the tensor is 2-D
start_ids = encode(start)
prompt_ids = torch.tensor(start_ids, dtype=torch.long, device=device)
x = prompt_ids[None, ...]
|
||
# run generation: draw num_samples continuations of the same prompt with
# gradients disabled and inside the autocast context chosen above
with torch.no_grad(), ctx:
    for _ in range(num_samples):
        y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
        # only the first row of the generated batch is decoded and printed
        print(decode(y[0].tolist()))
        print('---------------')
Oops, something went wrong.