00000193
apt-get install git-lfs   # or, on yum-based systems: yum install git-lfs
cd /home/deepseek-ai/
git clone https://www.modelscope.cn/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B.git
git clone https://www.modelscope.cn/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B.git
git clone https://www.modelscope.cn/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B.git
git clone https://www.modelscope.cn/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B.git
git clone https://www.modelscope.cn/deepseek-ai/DeepSeek-R1-Distill-Llama-8B
git clone https://www.modelscope.cn/deepseek-ai/DeepSeek-R1-Distill-Llama-70B
# Hosts-file entry (e.g. append to /etc/hosts) so the registry hostname resolves:
120.220.95.189 zibo.harbor.iluvatar.com.cn
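# Docker daemon configuration (typically /etc/docker/daemon.json; restart the Docker daemon after editing):
{
"exec-opts": ["native.cgroupdriver=systemd"],
"insecure-registries": ["zibo.harbor.iluvatar.com.cn:30000"]
}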
docker pull zibo.harbor.iluvatar.com.cn:30000/saas/bi100-3.2.1-aarch64-ubuntu20.04-py3.10-poc-llm-infer:v1.2.2
docker run -it -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home:/home -p 1000-1999:1000-1999 --name=test --pid=host --ipc=host --privileged --cap-add=ALL zibo.harbor.iluvatar.com.cn:30000/saas/bi100-3.2.1-aarch64-ubuntu20.04-py3.10-poc-llm-infer:v1.2.2 /bin/bash
cd /root/apps/llm-modelzoo/inference/Qwen/vllm
python3 offline_inference.py --model /home/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/ --max-tokens 256 --max-model-len 2048 -tp 1 --temperature 0.55
python3 offline_inference.py --model /home/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/ --max-tokens 256 --trust-remote-code --temperature 0.55 --max-model-len 2048
python3 offline_inference.py --model /home/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/ --max-tokens 256 -tp 2 --trust-remote-code --temperature 0.55 --max-model-len 2048
python3 offline_inference.py --model /home/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/ --max-tokens 256 -tp 4 --trust-remote-code --temperature 0.55 --max-model-len 2048
cd /root/apps/llm-modelzoo/inference/LLama/vllm
python3 offline_inference.py --model /home/deepseek-ai/DeepSeek-R1-Distill-Llama-8B --max-tokens 256 -tp 1 --trust-remote-code --temperature 0.55 --max-model-len 8192
python3 offline_inference.py --model /home/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/ --max-tokens 256 -tp 8 --trust-remote-code --temperature 0.55 --distributed-executor-backend ray --max-seq-len 8192 --gpu-memory-utilization 0.97 --max-model-len 8192
cd ~/apps/llm-modelzoo/benchmark/vllm
# Server side: start ONE of the following API servers at a time (they all listen on port 1234)
python3 -m vllm.entrypoints.openai.api_server --model /home/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/ --gpu-memory-utilization 0.9 --max-num-batched-tokens 5120 --max-model-len 2048 --max-num-seqs 256 --host 0.0.0.0 --port 1234 --trust-remote-code
python3 -m vllm.entrypoints.openai.api_server --model /home/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/ --gpu-memory-utilization 0.9 --max-num-batched-tokens 5120 --max-model-len 2048 --max-num-seqs 256 --host 0.0.0.0 --port 1234 --trust-remote-code
python3 -m vllm.entrypoints.openai.api_server --model /home/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/ --gpu-memory-utilization 0.9 --max-num-batched-tokens 5120 --max-model-len 2048 --max-num-seqs 256 -tp 2 --host 0.0.0.0 --port 1234 --trust-remote-code
python3 -m vllm.entrypoints.openai.api_server --model /home/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/ --gpu-memory-utilization 0.9 --max-num-batched-tokens 5120 --max-model-len 2048 --max-num-seqs 256 -tp 4 --host 0.0.0.0 --port 1234 --trust-remote-code
python3 -m vllm.entrypoints.openai.api_server --model /home/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/ --gpu-memory-utilization 0.9 --max-num-batched-tokens 8192 --max-model-len 8192 --max-num-seqs 256 -tp 1 --host 127.0.0.1 --port 1234
python3 -m vllm.entrypoints.openai.api_server --model /home/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/ --gpu-memory-utilization 0.9 --max-num-batched-tokens 8192 --max-model-len 8192 --max-num-seqs 256 -tp 8 --host 127.0.0.1 --port 1234
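# Client side: benchmark the server listening on port 1234 (presumably use the line whose --tokenizer path matches the model being served)
python3 benchmark_server_openapi.py --tokenizer /home/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/ --host 0.0.0.0 --port 1234 --num-prompts 32 --input-tokens 256 --output-tokens 128 --time-interval 0.5 --trust-remote-code --segments 50,90,99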
python3 benchmark_server_openapi.py --tokenizer /home/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/ --host 0.0.0.0 --port 1234 --num-prompts 32 --input-tokens 256 --output-tokens 128 --time-interval 0.5 --trust-remote-code --segments 50,90,99
python3 benchmark_server_openapi.py --tokenizer /home/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/ --host 0.0.0.0 --port 1234 --num-prompts 32 --input-tokens 256 --output-tokens 128 --time-interval 0.5 --trust-remote-code --segments 50,90,99
python3 benchmark_server_openapi.py --tokenizer /home/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/ --host 0.0.0.0 --port 1234 --num-prompts 32 --input-tokens 256 --output-tokens 128 --time-interval 0.5 --trust-remote-code --segments 50,90,99
python3 benchmark_server_openapi.py --tokenizer /home/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/ --host 0.0.0.0 --port 1234 --num-prompts 32 --input-tokens 256 --output-tokens 128 --time-interval 0.5 --trust-remote-code --segments 50,90,99