ufal · Gldkslfmsd · Apr 18, 2024 · Apr 14, 2024 · Apr 14, 2024 · Apr 14, 2024
diff --git a/whisper_online.py b/whisper_online.py
@@ -4,10 +4,15 @@
 import librosa  
 from functools import lru_cache
 import time
+import logging
+
+
 import io
 import soundfile as sf
 import math
 
+logger = logging.getLogger(__name__)
+
 @lru_cache
 def load_audio(fname):
     a, _ = librosa.load(fname, sr=16000, dtype=np.float32)
@@ -62,7 +67,7 @@ def load_model(self, modelsize=None, cache_dir=None, model_dir=None):
         from whisper_timestamped import transcribe_timestamped
         self.transcribe_timestamped = transcribe_timestamped
         if model_dir is not None:
-            print("ignoring model_dir, not implemented",file=self.logfile)
+            logger.debug("ignoring model_dir, not implemented")
         return whisper.load_model(modelsize, download_root=cache_dir)
 
     def transcribe(self, audio, init_prompt=""):
@@ -101,8 +106,9 @@ class FasterWhisperASR(ASRBase):
 
     def load_model(self, modelsize=None, cache_dir=None, model_dir=None):
         from faster_whisper import WhisperModel
+        logging.getLogger("faster_whisper").setLevel(logger.level)
         if model_dir is not None:
-            print(f"Loading whisper model from model_dir {model_dir}. modelsize and cache_dir parameters are not used.",file=self.logfile)
+            logger.debug(f"Loading whisper model from model_dir {model_dir}. modelsize and cache_dir parameters are not used.")
             model_size_or_path = model_dir
         elif modelsize is not None:
             model_size_or_path = modelsize
@@ -225,7 +231,7 @@ def transcribe(self, audio_data, prompt=None, *args, **kwargs):
 
         # Process transcription/translation
         transcript = proc.create(**params)
-        print(f"OpenAI API processed accumulated {self.transcribed_seconds} seconds",file=self.logfile)
+        logger.debug(f"OpenAI API processed accumulated {self.transcribed_seconds} seconds")
 
         return transcript
 
@@ -268,9 +274,11 @@ def insert(self, new, offset):
                         c = " ".join([self.commited_in_buffer[-j][2] for j in range(1,i+1)][::-1])
                         tail = " ".join(self.new[j-1][2] for j in range(1,i+1))
                         if c == tail:
-                            print("removing last",i,"words:",file=self.logfile)
+                            words = []
                             for j in range(i):
-                                print("\t",self.new.pop(0),file=self.logfile)
+                                words.append(repr(self.new.pop(0)))
+                            words_msg = "\t".join(words)
+                            logger.debug(f"removing last {i} words: {words_msg}")
                             break
 
     def flush(self):
@@ -359,9 +367,9 @@ def process_iter(self):
         """
 
         prompt, non_prompt = self.prompt()
-        print("PROMPT:", prompt, file=self.logfile)
-        print("CONTEXT:", non_prompt, file=self.logfile)
-        print(f"transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f} seconds from {self.buffer_time_offset:2.2f}",file=self.logfile)
+        logger.debug(f"PROMPT: {prompt}")
+        logger.debug(f"CONTEXT: {non_prompt}")
+        logger.debug(f"transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f} seconds from {self.buffer_time_offset:2.2f}")
         res = self.asr.transcribe(self.audio_buffer, init_prompt=prompt)
 
         # transform to [(beg,end,"word1"), ...]
@@ -370,8 +378,10 @@ def process_iter(self):
         self.transcript_buffer.insert(tsw, self.buffer_time_offset)
         o = self.transcript_buffer.flush()
         self.commited.extend(o)
-        print(">>>>COMPLETE NOW:",self.to_flush(o),file=self.logfile,flush=True)
-        print("INCOMPLETE:",self.to_flush(self.transcript_buffer.complete()),file=self.logfile,flush=True)
+        completed = self.to_flush(o)
+        logger.debug(f">>>>COMPLETE NOW: {completed}")
+        the_rest = self.to_flush(self.transcript_buffer.complete())
+        logger.debug(f"INCOMPLETE: {the_rest}")
 
         # there is a newly confirmed text
 
@@ -395,26 +405,26 @@ def process_iter(self):
             #while k>0 and self.commited[k][1] > l:
             #    k -= 1
             #t = self.commited[k][1] 
-            print(f"chunking segment",file=self.logfile)
+            logger.debug(f"chunking segment")
             #self.chunk_at(t)
 
-        print(f"len of buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}",file=self.logfile)
+        logger.debug(f"len of buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}")
         return self.to_flush(o)
 
     def chunk_completed_sentence(self):
         if self.commited == []: return
-        print(self.commited,file=self.logfile)
+        logger.debug(self.commited)
         sents = self.words_to_sentences(self.commited)
         for s in sents:
-            print("\t\tSENT:",s,file=self.logfile)
+            logger.debug(f"\t\tSENT: {s}")
         if len(sents) < 2:
             return
         while len(sents) > 2:
             sents.pop(0)
         # we will continue with audio processing at this timestamp
         chunk_at = sents[-2][1]
 
-        print(f"--- sentence chunked at {chunk_at:2.2f}",file=self.logfile)
+        logger.debug(f"--- sentence chunked at {chunk_at:2.2f}")
         self.chunk_at(chunk_at)
 
     def chunk_completed_segment(self, res):
@@ -431,12 +441,12 @@ def chunk_completed_segment(self, res):
                 ends.pop(-1)
                 e = ends[-2]+self.buffer_time_offset
             if e <= t:
-                print(f"--- segment chunked at {e:2.2f}",file=self.logfile)
+                logger.debug(f"--- segment chunked at {e:2.2f}")
                 self.chunk_at(e)
             else:
-                print(f"--- last segment not within commited area",file=self.logfile)
+                logger.debug(f"--- last segment not within commited area")
         else:
-            print(f"--- not enough segments to chunk",file=self.logfile)
+            logger.debug(f"--- not enough segments to chunk")
 
 
 
@@ -482,7 +492,7 @@ def finish(self):
         """
         o = self.transcript_buffer.complete()
         f = self.to_flush(o)
-        print("last, noncommited:",f,file=self.logfile)
+        logger.debug(f"last, noncommited: {f}")
         return f
 
 
@@ -522,7 +532,7 @@ def split(self, text):
 
     # the following languages are in Whisper, but not in wtpsplit:
     if lan in "as ba bo br bs fo haw hr ht jw lb ln lo mi nn oc sa sd sn so su sw tk tl tt".split():
-        print(f"{lan} code is not supported by wtpsplit. Going to use None lang_code option.", file=sys.stderr)
+        logger.debug(f"{lan} code is not supported by wtpsplit. Going to use None lang_code option.")
         lan = None
 
     from wtpsplit import WtP
@@ -548,14 +558,15 @@ def add_shared_args(parser):
     parser.add_argument('--vad', action="store_true", default=False, help='Use VAD = voice activity detection, with the default parameters.')
     parser.add_argument('--buffer_trimming', type=str, default="segment", choices=["sentence", "segment"],help='Buffer trimming strategy -- trim completed sentences marked with punctuation mark and detected by sentence segmenter, or the completed segments returned by Whisper. Sentence segmenter must be installed for "sentence" option.')
     parser.add_argument('--buffer_trimming_sec', type=float, default=15, help='Buffer trimming length threshold in seconds. If buffer length is longer, trimming sentence/segment is triggered.')
+    parser.add_argument("-l", "--log-level", dest="log_level", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help="Set the log level", default='DEBUG')
 
 def asr_factory(args, logfile=sys.stderr):
     """
     Creates and configures an ASR and ASR Online instance based on the specified backend and arguments.
     """
     backend = args.backend
     if backend == "openai-api":
-        print("Using OpenAI API.", file=logfile)
+        logger.debug("Using OpenAI API.")
         asr = OpenaiApiASR(lan=args.lan)
     else:
         if backend == "faster-whisper":
@@ -566,14 +577,14 @@ def asr_factory(args, logfile=sys.stderr):
         # Only for FasterWhisperASR and WhisperTimestampedASR
         size = args.model
         t = time.time()
-        print(f"Loading Whisper {size} model for {args.lan}...", file=logfile, end=" ", flush=True)
+        logger.debug(f"Loading Whisper {size} model for {args.lan}...")
         asr = asr_cls(modelsize=size, lan=args.lan, cache_dir=args.model_cache_dir, model_dir=args.model_dir)
         e = time.time()
-        print(f"done. It took {round(e-t,2)} seconds.", file=logfile)
+        logger.debug(f"done. It took {round(e-t,2)} seconds.")
 
     # Apply common configurations
     if getattr(args, 'vad', False):  # Checks if VAD argument is present and True
-        print("Setting VAD filter", file=logfile)
+        logger.info("Setting VAD filter")
         asr.use_vad()
 
     language = args.lan
@@ -611,14 +622,18 @@ def asr_factory(args, logfile=sys.stderr):
     logfile = sys.stderr
 
     if args.offline and args.comp_unaware:
-        print("No or one option from --offline and --comp_unaware are available, not both. Exiting.",file=logfile)
+        logger.error("No or one option from --offline and --comp_unaware are available, not both. Exiting.")
         sys.exit(1)
 
+    if args.log_level:
+        logging.basicConfig(format='whisper-%(levelname)s:%(name)s: %(message)s',
+                            level=getattr(logging, args.log_level))
+
     audio_path = args.audio_path
 
     SAMPLING_RATE = 16000
     duration = len(load_audio(audio_path))/SAMPLING_RATE
-    print("Audio duration is: %2.2f seconds" % duration, file=logfile)
+    logger.info("Audio duration is: %2.2f seconds" % duration)
 
     asr, online = asr_factory(args, logfile=logfile)
     min_chunk = args.min_chunk_size
@@ -645,16 +660,16 @@ def output_transcript(o, now=None):
             print("%1.4f %1.0f %1.0f %s" % (now*1000, o[0]*1000,o[1]*1000,o[2]),file=logfile,flush=True)
             print("%1.4f %1.0f %1.0f %s" % (now*1000, o[0]*1000,o[1]*1000,o[2]),flush=True)
         else:
-            print(o,file=logfile,flush=True)
+            # No text, so no output
+            pass
 
     if args.offline: ## offline mode processing (for testing/debugging)
         a = load_audio(audio_path)
         online.insert_audio_chunk(a)
         try:
             o = online.process_iter()
-        except AssertionError:
-            print("assertion error",file=logfile)
-            pass
+        except AssertionError as e:
+            logger.error(f"assertion error: {repr(e)}")
         else:
             output_transcript(o)
         now = None
@@ -665,13 +680,13 @@ def output_transcript(o, now=None):
             online.insert_audio_chunk(a)
             try:
                 o = online.process_iter()
-            except AssertionError:
-                print("assertion error",file=logfile)
+            except AssertionError as e:
+                logger.error(f"assertion error: {repr(e)}")
                 pass
             else:
                 output_transcript(o, now=end)
 
-            print(f"## last processed {end:.2f}s",file=logfile,flush=True)
+            logger.debug(f"## last processed {end:.2f}s")
 
             if end >= duration:
                 break
@@ -697,13 +712,13 @@ def output_transcript(o, now=None):
 
             try:
                 o = online.process_iter()
-            except AssertionError:
-                print("assertion error",file=logfile)
+            except AssertionError as e:
+                logger.error(f"assertion error: {repr(e)}")
                 pass
             else:
                 output_transcript(o)
             now = time.time() - start
-            print(f"## last processed {end:.2f} s, now is {now:.2f}, the latency is {now-end:.2f}",file=logfile,flush=True)
+            logger.debug(f"## last processed {end:.2f} s, now is {now:.2f}, the latency is {now-end:.2f}")
 
             if end >= duration:
                 break

diff --git a/whisper_online_server.py b/whisper_online_server.py
@@ -4,7 +4,10 @@
 import sys
 import argparse
 import os
+import logging
 import numpy as np
+
+logger = logging.getLogger(__name__)
 parser = argparse.ArgumentParser()
 
 # server options
@@ -18,6 +21,9 @@
 add_shared_args(parser)
 args = parser.parse_args()
 
+if args.log_level:
+    logging.basicConfig(format='whisper-server-%(levelname)s:%(name)s: %(message)s',
+                        level=getattr(logging, args.log_level))
 
 # setting whisper object by args 
 
@@ -28,35 +34,25 @@
 asr, online = asr_factory(args)
 min_chunk = args.min_chunk_size
 
-
-if args.buffer_trimming == "sentence":
-    tokenizer = create_tokenizer(tgt_language)
-else:
-    tokenizer = None
-online = OnlineASRProcessor(asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
-
 # warm up the ASR because the very first transcribe takes more time than the others. 
 # Test results in https://github.com/ufal/whisper_streaming/pull/81
 msg = "Whisper is not warmed up. The first chunk processing may take longer."
 if args.warmup_file:
     if os.path.isfile(args.warmup_file):
         a = load_audio_chunk(args.warmup_file,0,1)
         asr.transcribe(a)
-        print("INFO: Whisper is warmed up.",file=sys.stderr)
+        logger.info("Whisper is warmed up.")
     else:
-        print("WARNING: The warm up file is not available. "+msg,file=sys.stderr)
+        logger.warning("The warm up file is not available. "+msg)
 else:
-    print("WARNING: " + msg, file=sys.stderr)
+    logger.warning(msg)
 
 
 ######### Server objects
 
 import line_packet
 import socket
 
-import logging
-
-
 class Connection:
     '''it wraps conn object'''
     PACKET_SIZE = 65536
@@ -104,8 +100,6 @@ def receive_audio_chunk(self):
         out = []
         while sum(len(x) for x in out) < self.min_chunk*SAMPLING_RATE:
             raw_bytes = self.connection.non_blocking_receive_audio()
-            print(raw_bytes[:10])
-            print(len(raw_bytes))
             if not raw_bytes:
                 break
             sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
@@ -136,7 +130,7 @@ def format_output_transcript(self,o):
             print("%1.0f %1.0f %s" % (beg,end,o[2]),flush=True,file=sys.stderr)
             return "%1.0f %1.0f %s" % (beg,end,o[2])
         else:
-            print(o,file=sys.stderr,flush=True)
+            logger.debug("No text in this segment")
             return None
 
     def send_result(self, o):
@@ -150,39 +144,33 @@ def process(self):
         while True:
             a = self.receive_audio_chunk()
             if a is None:
-                print("break here",file=sys.stderr)
                 break
             self.online_asr_proc.insert_audio_chunk(a)
             o = online.process_iter()
             try:
                 self.send_result(o)
             except BrokenPipeError:
-                print("broken pipe -- connection closed?",file=sys.stderr)
+                logger.info("broken pipe -- connection closed?")
                 break
 
 #        o = online.finish()  # this should be working
 #        self.send_result(o)
 
 
 
-
-# Start logging.
-level = logging.INFO
-logging.basicConfig(level=level, format='whisper-server-%(levelname)s: %(message)s')
-
 # server loop
 
 with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
     s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
     s.bind((args.host, args.port))
     s.listen(1)
-    logging.info('INFO: Listening on'+str((args.host, args.port)))
+    logger.info('Listening on ' + str((args.host, args.port)))
     while True:
         conn, addr = s.accept()
-        logging.info('INFO: Connected to client on {}'.format(addr))
+        logger.info('Connected to client on {}'.format(addr))
         connection = Connection(conn)
         proc = ServerProcessor(connection, online, min_chunk)
         proc.process()
         conn.close()
-        logging.info('INFO: Connection to client closed')
-logging.info('INFO: Connection closed, terminating.')
+        logger.info('Connection to client closed')
+logger.info('Connection closed, terminating.')