sgl-project · Ying1123 · Aug 8, 2024 · Aug 7, 2024 · Aug 8, 2024 · Aug 8, 2024
diff --git a/docs/en/sampling_params.md b/docs/en/sampling_params.md
@@ -33,7 +33,7 @@ The `sampling_params` follows this format
 
 ```python
 # The maximum number of output tokens
-max_new_tokens: int = 16,
+max_new_tokens: int = 128,
 # Stop when hitting any of the strings in this list.
 stop: Optional[Union[str, List[str]]] = None,
 # Sampling temperature
@@ -90,7 +90,7 @@ response = requests.post(
         "text": "The capital of France is",
         "sampling_params": {
             "temperature": 0,
-            "max_new_tokens": 256,
+            "max_new_tokens": 32,
         },
         "stream": True,
     },

@@ -125,7 +125,7 @@ def run_internal(
     def run(
         self,
         *,
-        max_new_tokens: int = 16,
+        max_new_tokens: int = 128,
         stop: Union[str, List[str]] = (),
         temperature: float = 1.0,
         top_p: float = 1.0,
@@ -155,7 +155,7 @@ def run_batch(
         self,
         batch_kwargs,
         *,
-        max_new_tokens: int = 16,
+        max_new_tokens: int = 128,
         stop: Union[str, List[str]] = (),
         temperature: float = 1.0,
         top_p: float = 1.0,

@@ -16,7 +16,7 @@
 
 @dataclasses.dataclass
 class SglSamplingParams:
-    max_new_tokens: int = 16
+    max_new_tokens: int = 128
     stop: Union[str, List[str]] = ()
     temperature: float = 1.0
     top_p: float = 1.0
@@ -140,7 +140,7 @@ def bind(self, **kwargs):
     def run(
         self,
         *args,
-        max_new_tokens: int = 16,
+        max_new_tokens: int = 128,
         stop: Union[str, List[str]] = (),
         temperature: float = 1.0,
         top_p: float = 1.0,
@@ -179,7 +179,7 @@ def run_batch(
         self,
         batch_kwargs,
         *,
-        max_new_tokens: int = 16,
+        max_new_tokens: int = 128,
         stop: Union[str, List[str]] = (),
         temperature: float = 1.0,
         top_p: float = 1.0,

@@ -23,7 +23,7 @@
 class SamplingParams:
     def __init__(
         self,
-        max_new_tokens: int = 16,
+        max_new_tokens: int = 128,
         stop: Optional[Union[str, List[str]]] = None,
         temperature: float = 1.0,
         top_p: float = 1.0,

diff --git a/scripts/deprecated/test_curl.sh b/scripts/deprecated/test_curl.sh
@@ -3,7 +3,7 @@ curl http://localhost:30000/generate \
   -d '{
     "text": "Once upon a time,",
     "sampling_params": {
-      "max_new_tokens": 16,
+      "max_new_tokens": 64,
       "temperature": 0
     }
   }'
diff --git a/scripts/deprecated/test_httpserver_llava.py b/scripts/deprecated/test_httpserver_llava.py
@@ -36,7 +36,7 @@ async def test_concurrent(args):
                     "image_data": "example_image.png",
                     "sampling_params": {
                         "temperature": 0,
-                        "max_new_tokens": 16,
+                        "max_new_tokens": 64,
                     },
                 },
             )