sgl-project · merrymercy · Dec 9, 2024 · Dec 9, 2024 · Dec 9, 2024 · Dec 9, 2024
@@ -29,7 +29,6 @@
     BatchStrOut,
     BatchTokenIDOut,
 )
-from sglang.srt.managers.schedule_batch import FINISH_MATCHED_STR, FINISH_MATCHED_TOKEN
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import configure_logger, get_zmq_socket
 from sglang.utils import find_printable_text, get_exception_traceback

@@ -1198,6 +1198,7 @@ def stream_output(
             decode_ids_list = []
             read_offsets = []
             output_ids = []
+
             skip_special_tokens = []
             spaces_between_special_tokens = []
             no_stop_trim = []

@@ -623,23 +623,23 @@ async def handle_loop(self):
                             i,
                         )
 
+                    if not isinstance(recv_obj, BatchEmbeddingOut):
+                        meta_info.update(
+                            {
+                                "completion_tokens": recv_obj.completion_tokens[i],
+                                "cached_tokens": recv_obj.cached_tokens[i],
+                            }
+                        )
+
                     if isinstance(recv_obj, BatchStrOut):
                         out_dict = {
                             "text": recv_obj.output_strs[i],
-                            "meta_info": {
-                                **meta_info,
-                                "completion_tokens": recv_obj.completion_tokens[i],
-                                "cached_tokens": recv_obj.cached_tokens[i],
-                            },
+                            "meta_info": meta_info,
                         }
                     elif isinstance(recv_obj, BatchTokenIDOut):
                         out_dict = {
                             "token_ids": recv_obj.output_ids[i],
-                            "meta_info": {
-                                **meta_info,
-                                "completion_tokens": recv_obj.completion_tokens[i],
-                                "cached_tokens": recv_obj.cached_tokens[i],
-                            },
+                            "meta_info": meta_info,
                         }
                     else:
                         assert isinstance(recv_obj, BatchEmbeddingOut)

@@ -114,7 +114,7 @@ def __init__(
             server_args.chunked_prefill_size = -1
             self.mem_fraction_static *= 0.95
             logger.info(
-                f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static} "
+                f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static:.3f} "
                 f"and turn off chunked prefill "
                 f"because this is a multimodal model."
             )