from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import jsonlines

device = "cuda"
peft_model_id = "/dccstor/weiz/bikinie/peft/pt/sft_debugging/fms-hf-tuning/pt_ckpts/checkpoint-5730"
data_path = "/dccstor/weiz/remote_pycharm_dev/data/train_tiny.json"

# Load the PEFT config, then the frozen base model, then the tuned adapter on top of it.
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path, torch_dtype=torch.bfloat16
)
# Alternative: enable FlashAttention 2 if the environment supports it.
# model = AutoModelForCausalLM.from_pretrained(
#     config.base_model_name_or_path,
#     torch_dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
# )
model = PeftModel.from_pretrained(model, peft_model_id)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Read only the first record of the JSONL file and take its 'input' field.
with jsonlines.open(data_path) as f:
    for idx, line in enumerate(f.iter()):
        input_str = line["input"]
        break

inputs = tokenizer(input_str, return_tensors="pt")

model.to(device)
model.eval()

with torch.no_grad():
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # (Optional debug: a forward pass with labels would expose outputs.loss.)
    input_len = inputs["input_ids"].shape[-1]
    print("input_len:", input_len)
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=256,
        eos_token_id=3,  # model-specific EOS token id
    )
    len_outputs = outputs.shape[-1]
    print("len_outputs:", len_outputs)
    # Strip the prompt tokens so only the newly generated continuation is decoded.
    outputs = outputs[:, input_len:]
    output_str = tokenizer.batch_decode(
        outputs.detach().cpu().numpy(), skip_special_tokens=True
    )[0]
    print("output_str:")
    print(output_str)
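
# --- Illustrative extension (a minimal sketch, not invoked above) ---
# The run above decodes only the first record. Assuming every JSONL line
# carries an 'input' field, a hypothetical helper for sweeping the whole
# file could look like this; generate_all() is not part of the original
# script and is never called here.
def generate_all(path, max_new_tokens=256):
    """Generate and print a completion for every record in a JSONL file (sketch)."""
    with jsonlines.open(path) as reader:
        for idx, record in enumerate(reader.iter()):
            batch = tokenizer(record["input"], return_tensors="pt")
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                gen = model.generate(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    max_new_tokens=max_new_tokens,
                    eos_token_id=3,  # same model-specific EOS id as above
                )
            # Drop the prompt tokens before decoding, as in the main script.
            completion = tokenizer.batch_decode(
                gen[:, batch["input_ids"].shape[-1]:], skip_special_tokens=True
            )[0]
            print(f"[{idx}] {completion}")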