Very nice work. I ran into a problem during training: once I reach 100,000 steps and the l1 and lpips terms of the nll loss are computed, a CUDA error appears every time. Is this normal? I am training on an RTX 3090, and the error occurs whether I run on a single GPU or on multiple GPUs. The command is `python3 -m src.main +experiment=re10k data_loader.train.batch_size=1`.
The errors are:
File "/root/code/latentsplat/src/loss/loss_group.py", line 74, in forward_generator
adaptive_weight = self.get_adaptive_weight(total_loss, generator_loss.unweighted, last_layer_weights)
File "/opt/conda/envs/latentsplat/lib/python3.10/site-packages/jaxtyping/_decorator.py", line 453, in wrapped_fn
out = fn(*args, **kwargs)
File "/root/code/latentsplat/src/loss/loss_group.py", line 39, in get_adaptive_weight
nll_grads = torch.autograd.grad(nll_loss, last_layer_weights, retain_graph=True)[0]
File "/opt/conda/envs/latentsplat/lib/python3.10/site-packages/torch/autograd/__init__.py", line 303, in grad
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
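If I read this first traceback correctly, `torch.autograd.grad` raises this whenever the loss it is asked to differentiate carries no autograd graph. A minimal standalone snippet (my own illustration, not code from this repository) reproduces the same RuntimeError:

```python
import torch

layer = torch.nn.Linear(4, 4)
x = torch.randn(2, 4)

# Build a "loss" without an autograd graph, e.g. as if it had been computed
# under torch.no_grad() or detached somewhere upstream.
with torch.no_grad():
    nll_loss = layer(x).abs().mean()

# Fails the same way as get_adaptive_weight():
# "element 0 of tensors does not require grad and does not have a grad_fn"
nll_grads = torch.autograd.grad(nll_loss, layer.weight, retain_graph=True)[0]
```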
And the second error is:
Traceback (most recent call last):
File "/opt/conda/envs/latentsplat/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/opt/conda/envs/latentsplat/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/root/code/latentsplat/src/main.py", line 169, in<module>train()
File "/opt/conda/envs/latentsplat/lib/python3.10/site-packages/hydra/main.py", line 94, in decorated_main
_run_hydra(
File "/opt/conda/envs/latentsplat/lib/python3.10/site-packages/hydra/_internal/utils.py", line 394, in _run_hydra
_run_app(
File "/opt/conda/envs/latentsplat/lib/python3.10/site-packages/hydra/_internal/utils.py", line 457, in _run_app
run_and_report(
File "/opt/conda/envs/latentsplat/lib/python3.10/site-packages/hydra/_internal/utils.py", line 223, in run_and_report
raise ex
File "/opt/conda/envs/latentsplat/lib/python3.10/site-packages/hydra/_internal/utils.py", line 220, in run_and_report
return func()
File "/opt/conda/envs/latentsplat/lib/python3.10/site-packages/hydra/_internal/utils.py", line 458, in <lambda>
lambda: hydra.run(
File "/opt/conda/envs/latentsplat/lib/python3.10/site-packages/hydra/_internal/hydra.py", line 132, in run
_ = ret.return_value
File "/opt/conda/envs/latentsplat/lib/python3.10/site-packages/hydra/core/utils.py", line 260, in return_value
raise self._return_value
File "/opt/conda/envs/latentsplat/lib/python3.10/site-packages/hydra/core/utils.py", line 186, in run_job
ret.return_value = task_function(task_cfg)
File "/root/code/latentsplat/src/main.py", line 159, in train
trainer.fit(model_wrapper, datamodule=data_module, ckpt_path=checkpoint_path if cfg.checkpointing.resume else None, )
File "/opt/conda/envs/latentsplat/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 544, in fit
call._call_and_handle_interrupt(
File "/opt/conda/envs/latentsplat/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 44, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/opt/conda/envs/latentsplat/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 580, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/opt/conda/envs/latentsplat/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 987, in _run
results = self._run_stage()
File "/opt/conda/envs/latentsplat/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1033, in _run_stage
self.fit_loop.run()
File "/opt/conda/envs/latentsplat/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 205, in run
self.advance()
File "/opt/conda/envs/latentsplat/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 363, in advance
self.epoch_loop.run(self._data_fetcher)
File "/opt/conda/envs/latentsplat/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 140, in run
self.advance(data_fetcher)
File "/opt/conda/envs/latentsplat/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 252, in advance
batch_output = self.manual_optimization.run(kwargs)
File "/opt/conda/envs/latentsplat/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/manual.py", line 94, in run
self.advance(kwargs)
File "/opt/conda/envs/latentsplat/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/manual.py", line 114, in advance
training_step_output = call._call_strategy_hook(trainer, "training_step", *kwargs.values())
File "/opt/conda/envs/latentsplat/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 309, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/opt/conda/envs/latentsplat/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 391, in training_step
return self.lightning_module.training_step(*args, **kwargs)
File "/opt/conda/envs/latentsplat/lib/python3.10/site-packages/jaxtyping/_decorator.py", line 453, in wrapped_fn
out = fn(*args, **kwargs)
File "/root/code/latentsplat/src/model/model_wrapper.py", line 444, in training_step
self.manual_backward(generator_loss)
File "/opt/conda/envs/latentsplat/lib/python3.10/site-packages/pytorch_lightning/core/module.py", line 1071, in manual_backward
self.trainer.strategy.backward(loss, None, *args, **kwargs)
File "/opt/conda/envs/latentsplat/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 213, in backward
self.precision_plugin.backward(closure_loss, self.lightning_module, optimizer, *args, **kwargs)
File "/opt/conda/envs/latentsplat/lib/python3.10/site-packages/pytorch_lightning/plugins/precision/precision.py", line 72, in backward
model.backward(tensor, *args, **kwargs)
File "/opt/conda/envs/latentsplat/lib/python3.10/site-packages/pytorch_lightning/core/module.py", line 1090, in backward
loss.backward(*args, **kwargs)
File "/opt/conda/envs/latentsplat/lib/python3.10/site-packages/torch/_tensor.py", line 487, in backward
torch.autograd.backward(
File "/opt/conda/envs/latentsplat/lib/python3.10/site-packages/torch/autograd/__init__.py", line 200, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: CUDA error: invalid argument
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
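Since the CUDA "invalid argument" is reported asynchronously inside `backward()`, the stack trace above may not point at the kernel that actually failed. As a next debugging step (my own idea, nothing from the repository) I plan to rerun with synchronous kernel launches:

```python
import os

# Must be set before the first CUDA call, e.g. at the very top of src/main.py,
# or exported in the shell: CUDA_LAUNCH_BLOCKING=1 python3 -m src.main ...
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
```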
I checked the code; the errors are raised here once training reaches 100,000 steps:
latentsplat/src/model/model_wrapper.py, line 440 (commit 2404699)
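As a temporary workaround I am considering skipping the adaptive weight when the NLL loss carries no graph. This is purely a hypothetical sketch based on the traceback (the function name and the VQGAN-style weight formula are my guesses), not a fix taken from the repository:

```python
import torch

def get_adaptive_weight_guarded(nll_loss, generator_loss, last_layer_weights, eps=1e-4):
    # Hypothetical variant of LossGroup.get_adaptive_weight: fall back to a
    # constant weight when the NLL loss has no autograd graph attached.
    if not nll_loss.requires_grad or nll_loss.grad_fn is None:
        return torch.ones((), device=nll_loss.device)
    nll_grads = torch.autograd.grad(nll_loss, last_layer_weights, retain_graph=True)[0]
    g_grads = torch.autograd.grad(generator_loss, last_layer_weights, retain_graph=True)[0]
    weight = torch.norm(nll_grads) / (torch.norm(g_grads) + eps)
    return torch.clamp(weight, 0.0, 1e4).detach()
```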