Commit cb6d4d9

Checkpoint work.
galv committed Dec 6, 2023
1 parent 74e09f1 commit cb6d4d9
Showing 4 changed files with 23 additions and 20 deletions.
examples/asr/export/transducer/infer_transducer_trt.py (1 addition, 1 deletion)
@@ -159,7 +159,7 @@ def main():

    # Evaluate Pytorch Model (CPU/GPU)
    torch.cuda.cudart().cudaProfilerStart()
-   with torch.inference_mode():  # , torch.autocast("cuda"):
+   with torch.inference_mode(), torch.autocast("cuda"):
        actual_transcripts = nemo_model.transcribe(audio_filepath, batch_size=args.batch_size)[0]
    print("GALVEZ:")
    for at in actual_transcripts:
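Note on the change above: stacking torch.autocast("cuda") with torch.inference_mode() runs autocast-eligible ops (matmuls, convolutions) in reduced precision while skipping autograd bookkeeping entirely. A minimal sketch of the pattern outside NeMo (model and batch are placeholder names, not from this repository):

    import torch

    model = torch.nn.Linear(80, 128).cuda().eval()  # placeholder model
    batch = torch.randn(8, 80, device="cuda")       # placeholder input

    # inference_mode() disables autograd record-keeping; autocast("cuda")
    # dispatches eligible ops to float16/bfloat16 kernels.
    with torch.inference_mode(), torch.autocast("cuda"):
        out = model(batch)

    print(out.dtype)  # typically torch.float16 under autocast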
nemo/collections/asr/models/rnnt_models.py (5 additions, 0 deletions)
@@ -293,13 +293,16 @@ def transcribe(
        if augmentor:
            config['augmentor'] = augmentor

+       print("GALVEZ:augmentor=", augmentor)
+
        temporary_datalayer = self._setup_transcribe_dataloader(config)
        for test_batch in tqdm(temporary_datalayer, desc="Transcribing", disable=(not verbose)):
            torch.cuda.nvtx.range_push("encoder")
            encoded, encoded_len = self.forward(
                input_signal=test_batch[0].to(device),
                input_signal_length=test_batch[1].to(device)
            )
+           # print("GALVEZ:encoded=", encoded)
            torch.cuda.nvtx.range_pop()
            torch.cuda.nvtx.range_push("decoding")
            best_hyp, all_hyp = self.decoding.rnnt_decoder_predictions_tensor(
@@ -316,6 +319,8 @@ def transcribe(
                else:
                    all_hypotheses += best_hyp

+               print("GALVEZ:best_hyp=", best_hyp)
+
                del encoded
                del test_batch
        finally:
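Note on the NVTX calls wrapped around the encoder forward and the decoding step: range_push/range_pop create named spans on an Nsight Systems timeline, and the cudaProfilerStart/Stop pair delimits the capture window when the script runs under nsys profile --capture-range=cudaProfilerApi. A minimal sketch of the same pattern, with work() standing in for the profiled region:

    import torch

    def work():
        a = torch.randn(1024, 1024, device="cuda")
        b = torch.randn(1024, 1024, device="cuda")
        return a @ b

    torch.cuda.cudart().cudaProfilerStart()  # open the nsys capture window
    torch.cuda.nvtx.range_push("encoder")    # named span on the timeline
    out = work()
    torch.cuda.nvtx.range_pop()
    torch.cuda.cudart().cudaProfilerStop()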
nemo/collections/asr/parts/submodules/fast_rnnt_greedy_decoding.py (15 additions, 17 deletions)
@@ -314,6 +314,8 @@ def __call__(

        # This seems wrong. Do I need to negate this?
        k.masked_scatter_(self.blank_mask, self.last_label)
+       # This doesn't seem right. Why is my last label blank? It should be SOS, right?
+       # I should not copy k if last_label is SOS, right?
        self.last_label.copy_(k)

        # It seems that I am unconditionally copying. That is wrong... I should do a masked copy
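The uncertainty in the comments above is about masked-copy semantics: after a batch element emits blank, its label should be held at the carried value rather than overwritten. One subtlety worth knowing here is that masked_scatter_ consumes its source tensor contiguously, one element per True mask position, rather than elementwise, so torch.where (or boolean indexing on both sides) expresses the aligned per-position copy. A minimal sketch with toy tensors (all names illustrative):

    import torch

    blank_mask = torch.tensor([True, False, True])  # elements that just emitted blank
    last_label = torch.tensor([7, 3, 9])            # labels carried from the previous step
    k = torch.tensor([0, 5, 0])                     # argmax labels from the current step

    # masked_scatter_ takes source values in order: the two True positions
    # receive last_label[0] and last_label[1], not the values at indices 0 and 2.
    a = k.clone()
    a.masked_scatter_(blank_mask, last_label)
    print(a)  # tensor([7, 5, 3])

    # Elementwise "hold the carried label where blank" is torch.where:
    b = torch.where(blank_mask, last_label, k)
    print(b)  # tensor([7, 5, 9])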
@@ -345,40 +347,36 @@
        torch.cuda.cudart().cudaProfilerStart()
        cu_call(cudart.cudaGraphLaunch(self.graph_exec, torch.cuda.current_stream().cuda_stream))
        cu_call(cudart.cudaStreamSynchronize(torch.cuda.current_stream().cuda_stream))
        torch.cuda.cudart().cudaProfilerStop()
        end = time.time()
        print("total time:", end - start)

        torch.set_printoptions(threshold=100_000)
        print("GALVEZ:", self.symbols_per_time_step_cpu)
        print("GALVEZ:scores=", self.scores_cpu)
        print("GALVEZ:labels=", self.labels_cpu)
        print("GALVEZ:symbols_per_time_step=", self.symbols_per_time_step_cpu)


        torch.cuda.nvtx.range_push("Copy data out")
        # js = torch.zeros(batch_size, dtype=torch.int64, device="cpu")
        j = 0
        for t in range(max_time):
            max_non_blank_symbols = self.symbols_per_time_step_cpu[t]
            print("GALVEZ:", t, max_non_blank_symbols)
            for _ in range(max_non_blank_symbols):
                for i in range(batch_size):
                    if self.labels_cpu[j, i] == caller._blank_index:
                        # Ooops! This is not correct!!!!! It's continue... It's fine...
                        continue
                    hypotheses[i].y_sequence.append(self.labels_cpu[j, i])
                    hypotheses[i].timestep.append(t)
                    hypotheses[i].score += self.scores_cpu[j, i]
                j += 1
        # for i in range(batch_size):
        #     j =
        #     hypotheses[i].y_sequence.append(self.labels_cpu[, i])
        torch.cuda.nvtx.range_pop()
        torch.cuda.cudart().cudaProfilerStop()

        print("NEW:", hypotheses)

        # import ipdb; ipdb.set_trace()

        # out_len_cpu = out_len.to("cpu")
        # for i, t in product(range(batch_size), range(out_len_cpu[i])):
        #     # Need best_label at each seq_idx_t
        #     # Need time_idx_t as well, can derive via dividing by max_symbols_per_step? No, that is not true.
        #     # Need score, which comes from v
        #     j = 0
        #     while j < self.labels_cpu.shape[2] and self.labels_cpu[i, t, j] != caller._blank_index:
        #         hypotheses[i].y_sequence.append(self.labels_cpu[i, t, j])
        #         hypotheses[i].timestep.append(t)
        #         hypotheses[i].score += self.scores_cpu[i, t, j]
        #         j += 1
        import ipdb; ipdb.set_trace()

        return hypotheses
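Background on the cudaGraphLaunch call at the top of this hunk: the inner decoding loop has evidently been captured into self.graph_exec elsewhere, so each decode replays the recorded kernel sequence with a single launch instead of re-issuing every per-step kernel from Python. A minimal sketch of the capture-and-replay pattern using PyTorch's higher-level wrapper rather than the raw cuda-python bindings used here (shapes and ops are placeholders):

    import torch

    static_in = torch.randn(8, 128, device="cuda")
    weight = torch.randn(128, 128, device="cuda")
    static_out = torch.empty(8, 128, device="cuda")

    # Warm up on a side stream, as the PyTorch docs recommend before capture.
    s = torch.cuda.Stream()
    s.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(s):
        static_out.copy_(static_in @ weight)
    torch.cuda.current_stream().wait_stream(s)

    # Capture the op sequence once...
    g = torch.cuda.CUDAGraph()
    with torch.cuda.graph(g):
        static_out.copy_(static_in @ weight)

    # ...then replay it: refill the static input and relaunch with one call.
    static_in.copy_(torch.randn(8, 128, device="cuda"))
    g.replay()
    torch.cuda.synchronize()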
nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py (2 additions, 2 deletions)
@@ -603,8 +603,8 @@ def forward(

        with self.decoder.as_frozen(), self.joint.as_frozen():
            inseq = encoder_output  # [B, T, D]
-           # inseq = inseq[:, :5, :]
-           # logitlen.fill_(5)
+           # inseq = inseq[:, :1, :]
+           # logitlen.fill_(1)
            if isinstance(self._greedy_decode, RNNTGreedyDecodeFast):
                hypotheses = self._greedy_decode(
                    self, inseq, logitlen, device=inseq.device, partial_hypotheses=partial_hypotheses
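The commented-out truncation lines above (first :5, now :1 in this commit) are a debugging aid: shrinking the encoder output to a handful of frames makes a divergence between the fast decoder and the baseline reproducible on the smallest possible input. A sketch of that idea as a helper (truncate_batch is hypothetical, not part of NeMo):

    import torch

    def truncate_batch(encoder_output: torch.Tensor, lengths: torch.Tensor, t: int):
        """Keep the first t frames ([B, t, D]) so decoder mismatches reproduce on tiny inputs."""
        return encoder_output[:, :t, :].contiguous(), torch.clamp(lengths, max=t)

Unlike logitlen.fill_(1), clamping preserves lengths that are already shorter than the cutoff.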
