You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
(pytorch) root@node03:/mnt/pengfeng/perfxcloud/enpoints/solar-10.7b-instruct# CUDA_VISIBLE_DEVICES=0,1 bentoml serve . --port 2999
2024-04-28T04:44:35+0000 [WARNING] [cli] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:36+0000 [INFO] [cli] Prometheus metrics for HTTP BentoServer from "." can be accessed at http://localhost:2999/metrics.
2024-04-28T04:44:36+0000 [INFO] [cli] Starting production HTTP BentoServer from "." listening on http://0.0.0.0:2999 (Press CTRL+C to quit)
2024-04-28T04:44:41+0000 [WARNING] [api_server:15] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:41+0000 [WARNING] [api_server:5] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:41+0000 [WARNING] [api_server:13] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:41+0000 [WARNING] [api_server:6] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:41+0000 [WARNING] [api_server:14] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:41+0000 [WARNING] [api_server:1] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:41+0000 [WARNING] [api_server:4] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:41+0000 [WARNING] [api_server:8] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:41+0000 [WARNING] [api_server:9] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:41+0000 [WARNING] [api_server:10] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:41+0000 [WARNING] [api_server:7] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:41+0000 [WARNING] [api_server:16] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:2] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:12] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [runner:llm-llama-runner:1] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:31] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:17] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:11] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:26] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:28] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:18] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:29] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:21] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:20] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:24] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:22] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:27] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:32] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:30] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:23] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:25] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:3] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
num_gpus:
1
test
1
2
2024-04-28T04:44:42+0000 [WARNING] [api_server:19] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
True
4
INFO 04-28 04:44:42 llm_engine.py:87] Initializing an LLM engine with config: model='/root/bentoml/models/vllm-solar-10.7b-instruct-v1.0/47fbfdfc698c2533cc0d6162f219eea0ac9f3cbc', tokenizer='/root/bentoml/models/vllm-solar-10.7b-instruct-v1.0/47fbfdfc698c2533cc0d6162f219eea0ac9f3cbc', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)
WARNING: Logging before InitGoogleLogging() is written to STDERR
I0428 04:44:42.955093 4168285 ProcessGroupNCCL.cpp:686] [Rank 0] ProcessGroupNCCL initialization options:NCCL_ASYNC_ERROR_HANDLING: 1, NCCL_DESYNC_DEBUG: 0, NCCL_ENABLE_TIMING: 0, NCCL_BLOCKING_WAIT: 0, TIMEOUT(ms): 1800000, USE_HIGH_PRIORITY_STREAM: 0, TORCH_DISTRIBUTED_DEBUG: OFF, NCCL_DEBUG: OFF, ID=94919030298176
I0428 04:44:43.430707 4168285 ProcessGroupNCCL.cpp:1340] NCCL_DEBUG: N/A
I0428 04:44:43.452649 4168285 ProcessGroupNCCL.cpp:686] [Rank 0] ProcessGroupNCCL initialization options:NCCL_ASYNC_ERROR_HANDLING: 1, NCCL_DESYNC_DEBUG: 0, NCCL_ENABLE_TIMING: 0, NCCL_BLOCKING_WAIT: 0, TIMEOUT(ms): 1800000, USE_HIGH_PRIORITY_STREAM: 0, TORCH_DISTRIBUTED_DEBUG: OFF, NCCL_DEBUG: OFF, ID=94919030671728
I0428 04:44:43.452975 4168285 ProcessGroupNCCL.cpp:686] [Rank 0] ProcessGroupNCCL initialization options:NCCL_ASYNC_ERROR_HANDLING: 1, NCCL_DESYNC_DEBUG: 0, NCCL_ENABLE_TIMING: 0, NCCL_BLOCKING_WAIT: 0, TIMEOUT(ms): 1800000, USE_HIGH_PRIORITY_STREAM: 0, TORCH_DISTRIBUTED_DEBUG: OFF, NCCL_DEBUG: OFF, ID=94919030620976
2024-04-28T04:44:43+0000 [WARNING] [runner:llm-llama-runner:1] WARNING[XFORMERS]: xFormers can't load C++/CUDA extensions. xFormers was built for:
PyTorch 2.2.2+cu121 with CUDA 1201 (you have 2.1.0a0)
Python 3.8.19 (you have 3.8.18)
Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
Memory-efficient attention, SwiGLU, sparse and more won't be available.
Set XFORMERS_MORE_DETAILS=1 for more details
Traceback (most recent call last):
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/openllm/_runners.py", line 133, in init
self.model = vllm.AsyncLLMEngine.from_engine_args(
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/async_llm_engine.py", line 628, in from_engine_args
engine = cls(parallel_config.worker_use_ray,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/async_llm_engine.py", line 321, in init
self.engine = self._init_engine(*args, **kwargs)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/async_llm_engine.py", line 369, in _init_engine
return engine_class(*args, **kwargs)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/llm_engine.py", line 128, in init
self._init_workers()
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/llm_engine.py", line 181, in _init_workers
self._run_workers("load_model")
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/llm_engine.py", line 1041, in _run_workers
driver_worker_output = getattr(self.driver_worker,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/worker/worker.py", line 100, in load_model
self.model_runner.load_model()
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/worker/model_runner.py", line 88, in load_model
self.model = get_model(self.model_config,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/utils.py", line 52, in get_model
return get_model_fn(model_config, device_config, **kwargs)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/model_loader.py", line 70, in get_model
model = model_class(model_config.hf_config, linear_method,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/models/llama.py", line 315, in init
self.model = LlamaModel(config, linear_method, lora_config=lora_config)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/models/llama.py", line 250, in init
self.layers = nn.ModuleList([
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/models/llama.py", line 251, in
LlamaDecoderLayer(config, linear_method)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/models/llama.py", line 190, in init
self.mlp = LlamaMLP(
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/models/llama.py", line 67, in init
self.down_proj = RowParallelLinear(intermediate_size,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/layers/linear.py", line 523, in init
self.linear_weights = self.linear_method.create_weights(
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/layers/linear.py", line 63, in create_weights
weight = Parameter(torch.empty(output_size_per_partition,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/torch/utils/_device.py", line 77, in torch_function
return func(*args, **kwargs)
torch.cuda.OutOfMemoryError: HIP out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacty of 15.98 GiB of which 0 bytes is free. Of the allocated memory 971.02 MiB is allocated by PyTorch, and 1008.00 KiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_HIP_ALLOC_CONF
2024-04-28T04:44:45+0000 [ERROR] [runner:llm-llama-runner:1] An exception occurred while instantiating runner 'llm-llama-runner', see details below:
2024-04-28T04:44:45+0000 [ERROR] [runner:llm-llama-runner:1] Traceback (most recent call last):
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/openllm/_runners.py", line 133, in init
self.model = vllm.AsyncLLMEngine.from_engine_args(
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/async_llm_engine.py", line 628, in from_engine_args
engine = cls(parallel_config.worker_use_ray,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/async_llm_engine.py", line 321, in init
self.engine = self._init_engine(*args, **kwargs)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/async_llm_engine.py", line 369, in _init_engine
return engine_class(*args, **kwargs)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/llm_engine.py", line 128, in init
self._init_workers()
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/llm_engine.py", line 181, in _init_workers
self._run_workers("load_model")
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/llm_engine.py", line 1041, in _run_workers
driver_worker_output = getattr(self.driver_worker,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/worker/worker.py", line 100, in load_model
self.model_runner.load_model()
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/worker/model_runner.py", line 88, in load_model
self.model = get_model(self.model_config,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/utils.py", line 52, in get_model
return get_model_fn(model_config, device_config, **kwargs)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/model_loader.py", line 70, in get_model
model = model_class(model_config.hf_config, linear_method,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/models/llama.py", line 315, in init
self.model = LlamaModel(config, linear_method, lora_config=lora_config)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/models/llama.py", line 250, in init
self.layers = nn.ModuleList([
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/models/llama.py", line 251, in
LlamaDecoderLayer(config, linear_method)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/models/llama.py", line 190, in init
self.mlp = LlamaMLP(
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/models/llama.py", line 67, in init
self.down_proj = RowParallelLinear(intermediate_size,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/layers/linear.py", line 523, in init
self.linear_weights = self.linear_method.create_weights(
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/layers/linear.py", line 63, in create_weights
weight = Parameter(torch.empty(output_size_per_partition,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/torch/utils/_device.py", line 77, in torch_function
return func(*args, **kwargs)
torch.cuda.OutOfMemoryError: HIP out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacty of 15.98 GiB of which 0 bytes is free. Of the allocated memory 971.02 MiB is allocated by PyTorch, and 1008.00 KiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_HIP_ALLOC_CONF
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/bentoml/_internal/runner/runner.py", line 311, in init_local
self._set_handle(LocalRunnerRef)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/bentoml/_internal/runner/runner.py", line 153, in _set_handle
runner_handle = handle_class(self, *args, **kwargs)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/bentoml/_internal/runner/runner_handle/local.py", line 27, in init
self._runnable = runner.runnable_class(**runner.runnable_init_params) # type: ignore
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/openllm/_runners.py", line 150, in init
raise openllm.exceptions.OpenLLMException(f'Failed to initialise vLLMEngine due to the following error:\n{err}') from err
openllm_core.exceptions.OpenLLMException: Failed to initialise vLLMEngine due to the following error:
HIP out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacty of 15.98 GiB of which 0 bytes is free. Of the allocated memory 971.02 MiB is allocated by PyTorch, and 1008.00 KiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_HIP_ALLOC_CONF
2024-04-28T04:44:45+0000 [ERROR] [runner:llm-llama-runner:1] Traceback (most recent call last):
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/openllm/_runners.py", line 133, in init
self.model = vllm.AsyncLLMEngine.from_engine_args(
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/async_llm_engine.py", line 628, in from_engine_args
engine = cls(parallel_config.worker_use_ray,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/async_llm_engine.py", line 321, in init
self.engine = self._init_engine(*args, **kwargs)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/async_llm_engine.py", line 369, in _init_engine
return engine_class(*args, **kwargs)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/llm_engine.py", line 128, in init
self._init_workers()
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/llm_engine.py", line 181, in _init_workers
self._run_workers("load_model")
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/llm_engine.py", line 1041, in _run_workers
driver_worker_output = getattr(self.driver_worker,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/worker/worker.py", line 100, in load_model
self.model_runner.load_model()
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/worker/model_runner.py", line 88, in load_model
self.model = get_model(self.model_config,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/utils.py", line 52, in get_model
return get_model_fn(model_config, device_config, **kwargs)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/model_loader.py", line 70, in get_model
model = model_class(model_config.hf_config, linear_method,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/models/llama.py", line 315, in init
self.model = LlamaModel(config, linear_method, lora_config=lora_config)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/models/llama.py", line 250, in init
self.layers = nn.ModuleList([
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/models/llama.py", line 251, in
LlamaDecoderLayer(config, linear_method)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/models/llama.py", line 190, in init
self.mlp = LlamaMLP(
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/models/llama.py", line 67, in init
self.down_proj = RowParallelLinear(intermediate_size,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/layers/linear.py", line 523, in init
self.linear_weights = self.linear_method.create_weights(
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/layers/linear.py", line 63, in create_weights
weight = Parameter(torch.empty(output_size_per_partition,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/torch/utils/_device.py", line 77, in torch_function
return func(*args, **kwargs)
torch.cuda.OutOfMemoryError: HIP out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacty of 15.98 GiB of which 0 bytes is free. Of the allocated memory 971.02 MiB is allocated by PyTorch, and 1008.00 KiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_HIP_ALLOC_CONF
The api_server.py is as follows:
from __future__ import annotations
import uuid
from typing import Any, AsyncGenerator, Dict, TypedDict, Union
from bentoml import Service
from bentoml.io import JSON, Text
from openllm import LLM
async def streamer() -> AsyncGenerator[str, None]:
async for request_output in generator:
for output in request_output.outputs:
i = output.index
previous_texts[i].append(output.text)
yield output.text
if request['stream']:
return streamer()
async for _ in streamer(): pass
return ''.join(previous_texts[0])
The text was updated successfully, but these errors were encountered:
(pytorch) root@node03:/mnt/pengfeng/perfxcloud/enpoints/solar-10.7b-instruct# CUDA_VISIBLE_DEVICES=0,1 bentoml serve . --port 2999
2024-04-28T04:44:35+0000 [WARNING] [cli] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:36+0000 [INFO] [cli] Prometheus metrics for HTTP BentoServer from "." can be accessed at http://localhost:2999/metrics.
2024-04-28T04:44:36+0000 [INFO] [cli] Starting production HTTP BentoServer from "." listening on http://0.0.0.0:2999 (Press CTRL+C to quit)
2024-04-28T04:44:41+0000 [WARNING] [api_server:15] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:41+0000 [WARNING] [api_server:5] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:41+0000 [WARNING] [api_server:13] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:41+0000 [WARNING] [api_server:6] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:41+0000 [WARNING] [api_server:14] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:41+0000 [WARNING] [api_server:1] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:41+0000 [WARNING] [api_server:4] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:41+0000 [WARNING] [api_server:8] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:41+0000 [WARNING] [api_server:9] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:41+0000 [WARNING] [api_server:10] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:41+0000 [WARNING] [api_server:7] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:41+0000 [WARNING] [api_server:16] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:2] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:12] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [runner:llm-llama-runner:1] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:31] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:17] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:11] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:26] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:28] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:18] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:29] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:21] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:20] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:24] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:22] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:27] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:32] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:30] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:23] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:25] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
2024-04-28T04:44:42+0000 [WARNING] [api_server:3] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
num_gpus:
1
test
1
2
2024-04-28T04:44:42+0000 [WARNING] [api_server:19] Converting SOLAR-10.7B-Instruct-v1.0 to lowercase: solar-10.7b-instruct-v1.0.
True
4
INFO 04-28 04:44:42 llm_engine.py:87] Initializing an LLM engine with config: model='/root/bentoml/models/vllm-solar-10.7b-instruct-v1.0/47fbfdfc698c2533cc0d6162f219eea0ac9f3cbc', tokenizer='/root/bentoml/models/vllm-solar-10.7b-instruct-v1.0/47fbfdfc698c2533cc0d6162f219eea0ac9f3cbc', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)
WARNING: Logging before InitGoogleLogging() is written to STDERR
I0428 04:44:42.955093 4168285 ProcessGroupNCCL.cpp:686] [Rank 0] ProcessGroupNCCL initialization options:NCCL_ASYNC_ERROR_HANDLING: 1, NCCL_DESYNC_DEBUG: 0, NCCL_ENABLE_TIMING: 0, NCCL_BLOCKING_WAIT: 0, TIMEOUT(ms): 1800000, USE_HIGH_PRIORITY_STREAM: 0, TORCH_DISTRIBUTED_DEBUG: OFF, NCCL_DEBUG: OFF, ID=94919030298176
I0428 04:44:43.430707 4168285 ProcessGroupNCCL.cpp:1340] NCCL_DEBUG: N/A
I0428 04:44:43.452649 4168285 ProcessGroupNCCL.cpp:686] [Rank 0] ProcessGroupNCCL initialization options:NCCL_ASYNC_ERROR_HANDLING: 1, NCCL_DESYNC_DEBUG: 0, NCCL_ENABLE_TIMING: 0, NCCL_BLOCKING_WAIT: 0, TIMEOUT(ms): 1800000, USE_HIGH_PRIORITY_STREAM: 0, TORCH_DISTRIBUTED_DEBUG: OFF, NCCL_DEBUG: OFF, ID=94919030671728
I0428 04:44:43.452975 4168285 ProcessGroupNCCL.cpp:686] [Rank 0] ProcessGroupNCCL initialization options:NCCL_ASYNC_ERROR_HANDLING: 1, NCCL_DESYNC_DEBUG: 0, NCCL_ENABLE_TIMING: 0, NCCL_BLOCKING_WAIT: 0, TIMEOUT(ms): 1800000, USE_HIGH_PRIORITY_STREAM: 0, TORCH_DISTRIBUTED_DEBUG: OFF, NCCL_DEBUG: OFF, ID=94919030620976
2024-04-28T04:44:43+0000 [WARNING] [runner:llm-llama-runner:1] WARNING[XFORMERS]: xFormers can't load C++/CUDA extensions. xFormers was built for:
PyTorch 2.2.2+cu121 with CUDA 1201 (you have 2.1.0a0)
Python 3.8.19 (you have 3.8.18)
Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
Memory-efficient attention, SwiGLU, sparse and more won't be available.
Set XFORMERS_MORE_DETAILS=1 for more details
Traceback (most recent call last):
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/openllm/_runners.py", line 133, in init
self.model = vllm.AsyncLLMEngine.from_engine_args(
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/async_llm_engine.py", line 628, in from_engine_args
engine = cls(parallel_config.worker_use_ray,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/async_llm_engine.py", line 321, in init
self.engine = self._init_engine(*args, **kwargs)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/async_llm_engine.py", line 369, in _init_engine
return engine_class(*args, **kwargs)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/llm_engine.py", line 128, in init
self._init_workers()
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/llm_engine.py", line 181, in _init_workers
self._run_workers("load_model")
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/llm_engine.py", line 1041, in _run_workers
driver_worker_output = getattr(self.driver_worker,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/worker/worker.py", line 100, in load_model
self.model_runner.load_model()
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/worker/model_runner.py", line 88, in load_model
self.model = get_model(self.model_config,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/utils.py", line 52, in get_model
return get_model_fn(model_config, device_config, **kwargs)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/model_loader.py", line 70, in get_model
model = model_class(model_config.hf_config, linear_method,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/models/llama.py", line 315, in init
self.model = LlamaModel(config, linear_method, lora_config=lora_config)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/models/llama.py", line 250, in init
self.layers = nn.ModuleList([
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/models/llama.py", line 251, in
LlamaDecoderLayer(config, linear_method)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/models/llama.py", line 190, in init
self.mlp = LlamaMLP(
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/models/llama.py", line 67, in init
self.down_proj = RowParallelLinear(intermediate_size,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/layers/linear.py", line 523, in init
self.linear_weights = self.linear_method.create_weights(
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/layers/linear.py", line 63, in create_weights
weight = Parameter(torch.empty(output_size_per_partition,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/torch/utils/_device.py", line 77, in torch_function
return func(*args, **kwargs)
torch.cuda.OutOfMemoryError: HIP out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacty of 15.98 GiB of which 0 bytes is free. Of the allocated memory 971.02 MiB is allocated by PyTorch, and 1008.00 KiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_HIP_ALLOC_CONF
2024-04-28T04:44:45+0000 [ERROR] [runner:llm-llama-runner:1] An exception occurred while instantiating runner 'llm-llama-runner', see details below:
2024-04-28T04:44:45+0000 [ERROR] [runner:llm-llama-runner:1] Traceback (most recent call last):
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/openllm/_runners.py", line 133, in init
self.model = vllm.AsyncLLMEngine.from_engine_args(
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/async_llm_engine.py", line 628, in from_engine_args
engine = cls(parallel_config.worker_use_ray,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/async_llm_engine.py", line 321, in init
self.engine = self._init_engine(*args, **kwargs)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/async_llm_engine.py", line 369, in _init_engine
return engine_class(*args, **kwargs)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/llm_engine.py", line 128, in init
self._init_workers()
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/llm_engine.py", line 181, in _init_workers
self._run_workers("load_model")
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/llm_engine.py", line 1041, in _run_workers
driver_worker_output = getattr(self.driver_worker,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/worker/worker.py", line 100, in load_model
self.model_runner.load_model()
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/worker/model_runner.py", line 88, in load_model
self.model = get_model(self.model_config,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/utils.py", line 52, in get_model
return get_model_fn(model_config, device_config, **kwargs)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/model_loader.py", line 70, in get_model
model = model_class(model_config.hf_config, linear_method,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/models/llama.py", line 315, in init
self.model = LlamaModel(config, linear_method, lora_config=lora_config)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/models/llama.py", line 250, in init
self.layers = nn.ModuleList([
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/models/llama.py", line 251, in
LlamaDecoderLayer(config, linear_method)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/models/llama.py", line 190, in init
self.mlp = LlamaMLP(
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/models/llama.py", line 67, in init
self.down_proj = RowParallelLinear(intermediate_size,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/layers/linear.py", line 523, in init
self.linear_weights = self.linear_method.create_weights(
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/layers/linear.py", line 63, in create_weights
weight = Parameter(torch.empty(output_size_per_partition,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/torch/utils/_device.py", line 77, in torch_function
return func(*args, **kwargs)
torch.cuda.OutOfMemoryError: HIP out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacty of 15.98 GiB of which 0 bytes is free. Of the allocated memory 971.02 MiB is allocated by PyTorch, and 1008.00 KiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_HIP_ALLOC_CONF
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/bentoml/_internal/runner/runner.py", line 311, in init_local
self._set_handle(LocalRunnerRef)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/bentoml/_internal/runner/runner.py", line 153, in _set_handle
runner_handle = handle_class(self, *args, **kwargs)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/bentoml/_internal/runner/runner_handle/local.py", line 27, in init
self._runnable = runner.runnable_class(**runner.runnable_init_params) # type: ignore
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/openllm/_runners.py", line 150, in init
raise openllm.exceptions.OpenLLMException(f'Failed to initialise vLLMEngine due to the following error:\n{err}') from err
openllm_core.exceptions.OpenLLMException: Failed to initialise vLLMEngine due to the following error:
HIP out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacty of 15.98 GiB of which 0 bytes is free. Of the allocated memory 971.02 MiB is allocated by PyTorch, and 1008.00 KiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_HIP_ALLOC_CONF
2024-04-28T04:44:45+0000 [ERROR] [runner:llm-llama-runner:1] Traceback (most recent call last):
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/openllm/_runners.py", line 133, in init
self.model = vllm.AsyncLLMEngine.from_engine_args(
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/async_llm_engine.py", line 628, in from_engine_args
engine = cls(parallel_config.worker_use_ray,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/async_llm_engine.py", line 321, in init
self.engine = self._init_engine(*args, **kwargs)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/async_llm_engine.py", line 369, in _init_engine
return engine_class(*args, **kwargs)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/llm_engine.py", line 128, in init
self._init_workers()
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/llm_engine.py", line 181, in _init_workers
self._run_workers("load_model")
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/engine/llm_engine.py", line 1041, in _run_workers
driver_worker_output = getattr(self.driver_worker,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/worker/worker.py", line 100, in load_model
self.model_runner.load_model()
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/worker/model_runner.py", line 88, in load_model
self.model = get_model(self.model_config,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/utils.py", line 52, in get_model
return get_model_fn(model_config, device_config, **kwargs)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/model_loader.py", line 70, in get_model
model = model_class(model_config.hf_config, linear_method,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/models/llama.py", line 315, in init
self.model = LlamaModel(config, linear_method, lora_config=lora_config)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/models/llama.py", line 250, in init
self.layers = nn.ModuleList([
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/models/llama.py", line 251, in
LlamaDecoderLayer(config, linear_method)
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/models/llama.py", line 190, in init
self.mlp = LlamaMLP(
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/models/llama.py", line 67, in init
self.down_proj = RowParallelLinear(intermediate_size,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/layers/linear.py", line 523, in init
self.linear_weights = self.linear_method.create_weights(
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/vllm-0.3.3+gitunknown.abi1.dtk2310.torch2.1-py3.8-linux-x86_64.egg/vllm/model_executor/layers/linear.py", line 63, in create_weights
weight = Parameter(torch.empty(output_size_per_partition,
File "/root/miniconda3/envs/pytorch/lib/python3.8/site-packages/torch/utils/_device.py", line 77, in torch_function
return func(*args, **kwargs)
torch.cuda.OutOfMemoryError: HIP out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacty of 15.98 GiB of which 0 bytes is free. Of the allocated memory 971.02 MiB is allocated by PyTorch, and 1008.00 KiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_HIP_ALLOC_CONF
The api_server.py is as follows:
from future import annotations
import uuid
from typing import Any, AsyncGenerator, Dict, TypedDict, Union
from bentoml import Service
from bentoml.io import JSON, Text
from openllm import LLM
llm = LLM[Any, Any]('/mnt/pengfeng/perfxcloud/models/SOLAR-10.7B-Instruct-v1.0', backend='vllm')
svc = Service('SOLAR-10.7B-Instruct-v1.0', runners=[llm.runner])
class GenerateInput(TypedDict):
    """JSON payload accepted by the ``/v1/generate`` endpoint."""

    # Prompt text forwarded to the LLM.
    prompt: str
    # When true, the endpoint returns an async generator of text chunks;
    # otherwise it returns a single string.
    stream: bool
    # Extra keyword arguments forwarded to ``llm.generate_iterator``; an optional
    # ``'n'`` entry selects the number of completions.
    sampling_params: Dict[str, Any]
@svc.api(
    route='/v1/generate',
    input=JSON.from_sample(
        GenerateInput(prompt='What is time?', stream=False, sampling_params={'temperature': 0.73, 'logprobs': 1})
    ),
    output=Text(content_type='text/event-stream'),
)
async def generate(request: GenerateInput) -> Union[AsyncGenerator[str, None], str]:
    """Generate text for ``request['prompt']``, streamed or as one string.

    Args:
        request: Payload with ``prompt``, ``stream`` and ``sampling_params``.
            ``sampling_params`` may carry ``'n'`` (number of completions,
            default 1); every other entry is forwarded to
            ``llm.generate_iterator`` as a keyword argument.

    Returns:
        If ``request['stream']`` is truthy, an async generator yielding each
        ``output.text`` as it is produced. Otherwise the chunks of the
        completion with index 0, joined into a single string.
    """
    # Work on a copy so extracting 'n' does not mutate the caller's payload.
    sampling_params = dict(request['sampling_params'])
    n = sampling_params.pop('n', 1)
    request_id = f'SOLAR-10.7B-Instruct-v1.0-{uuid.uuid4().hex}'
    # One independent buffer per completion. `[[]] * n` would alias a single
    # list n times, mixing the chunks of all completions together.
    previous_texts = [[] for _ in range(n)]
    generator = llm.generate_iterator(request['prompt'], request_id=request_id, n=n, **sampling_params)

    async def streamer() -> AsyncGenerator[str, None]:
        # Record every chunk under its completion index, then pass it through.
        async for request_output in generator:
            for output in request_output.outputs:
                previous_texts[output.index].append(output.text)
                yield output.text

    if request['stream']:
        return streamer()

    # Non-streaming: drain the generator so the buffers are fully populated.
    async for _ in streamer():
        pass
    return ''.join(previous_texts[0])
The text was updated successfully, but these errors were encountered: