llama.cpp 单次正常处理过程

日志解释

# 请求:代理收到,重定向转发给模型
srv proxy_reques: proxying request to model Qwen3.5-27B on port 43981
# 解析:模型将按照特定模板解析输入
srv params_from_: Chat format: peg-native
# 队列:使用LRU (Least Recently Used) 算法,选择槽位来处理
slot get_availabl: id  1 | task -1 | selected slot by LRU
# 缓存:将当前槽位的 Prompt 状态(约 9.3 万 token)保存到缓存,占用约 576 MiB 内存。
srv prompt_save: - saving prompt with length 93153, total state size = 576.295 MiB
# 缓存:命中,复用
srv load: - found better prompt with f_keep = 0.983, sim = 0.824
# 缓存:缓存大小达到上限,删除最旧的条目,进行内存置换
srv update:  - cache size limit reached, removing oldest entry (size = 1661.273 MiB)
# 算法:配置生成算法。定义了模型如何从概率分布中选择下一个字(包括惩罚重复、Top-K、Top-P、温度调节等步骤)。
slot launch_slot_: id  1 | task -1 | sampler chain: logits -> penalties -> ?dry -> ?top-n-sigma -> top-k -> ?typical -> top-p -> min-p -> ?xtc -> temp-ext -> dist 
# 推理:正式开始处理编号为 73503 的任务
slot launch_slot_: id 1 | task 73503 | processing task
# 推理:本次新提示词的长度
slot update_slots: id  1 | task 73503 | new prompt, n_ctx_slot = 102400, n_keep = 0, task.n_tokens = 40322
# 推理:发现可复用n_past
slot update_slots: id  1 | task 73503 | n_past = 33235, slot.prompt.tokens.size() = 33801, seq_id = 1, pos_min = 33800, n_swa = 0
# 推理:从断点恢复加载,避免n_past个token重复计算
slot update_slots: id  1 | task 73503 | restored context checkpoint (pos_min = 33234, pos_max = 33234, n_tokens = 33235, n_past = 33235, size = 62.813 MiB)
# 中断:客户端断开或超时,连接被取消了
srv operator(): http client error: Connection handling canceled
# 中断:73503任务计算停止,任务取消
srv stop: cancel task, id_task = 73503
# 性能:另一个任务的性能统计
slot print_timing: id 0 | task 73505
# 性能:预处理速度约 87.78 tokens/s,生成速度约 9.57 tokens/s
prompt eval time  =    8806.15 ms /   773 tokens (   11.39 ms per token,    87.78 tokens per second)
       eval time  =   12535.17 ms /   120 tokens (  104.46 ms per token,     9.57 tokens per second)
       total time =   21341.31 ms /   893 tokens

日志

[43981] srv  update_slots: all slots are idle
srv  proxy_reques: proxying request to model Qwen3.5-27B on port 43981
[43981] srv  params_from_: Chat format: peg-native
[43981] slot get_availabl: id  1 | task -1 | selected slot by LRU, t_last = 793598224933
[43981] srv  get_availabl: updating prompt cache
[43981] srv   prompt_save:  - saving prompt with length 93153, total state size = 576.295 MiB
[43981] srv          load:  - looking for better prompt, base f_keep = 0.000, sim = 0.000
[43981] srv          load:  - found better prompt with f_keep = 0.983, sim = 0.824
[43981] srv        update:  - cache size limit reached, removing oldest entry (size = 1661.273 MiB)
[43981] srv        update:  - cache state: 6 prompts, 7035.574 MiB (limits: 8192.000 MiB, 204800 tokens, 371615 est)
[43981] srv        update:    - prompt 0x75194c03b950:   58469 tokens, checkpoints: 17,  1452.933 MiB
[43981] srv        update:    - prompt 0x7518c45350c0:   50939 tokens, checkpoints: 15,  1285.799 MiB
[43981] srv        update:    - prompt 0x5b8ab3f84d00:   59570 tokens, checkpoints: 19,  1584.628 MiB
[43981] srv        update:    - prompt 0x5b8ab60f5fe0:   30425 tokens, checkpoints:  8,   733.029 MiB
[43981] srv        update:    - prompt 0x5b8ab3ba2d40:   26600 tokens, checkpoints:  6,   586.318 MiB
[43981] srv        update:    - prompt 0x75192c5a9100:   93153 tokens, checkpoints: 13,  1392.867 MiB
[43981] srv  get_availabl: prompt cache update took 1176.11 ms
[43981] slot launch_slot_: id  1 | task -1 | sampler chain: logits -> penalties -> ?dry -> ?top-n-sigma -> top-k -> ?typical -> top-p -> min-p -> ?xtc -> temp-ext -> dist 
[43981] slot launch_slot_: id  1 | task 73503 | processing task, is_child = 0
[43981] slot update_slots: id  1 | task 73503 | new prompt, n_ctx_slot = 102400, n_keep = 0, task.n_tokens = 40322
[43981] slot update_slots: id  1 | task 73503 | n_past = 33235, slot.prompt.tokens.size() = 33801, seq_id = 1, pos_min = 33800, n_swa = 0
[43981] slot update_slots: id  1 | task 73503 | Checking checkpoint with [33234, 33234] against 33235...
[43981] slot update_slots: id  1 | task 73503 | restored context checkpoint (pos_min = 33234, pos_max = 33234, n_tokens = 33235, n_past = 33235, size = 62.813 MiB)
[43981] slot update_slots: id  1 | task 73503 | n_tokens = 33235, memory_seq_rm [33235, end)
[43981] slot update_slots: id  1 | task 73503 | prompt processing progress, n_tokens = 37331, batch.n_tokens = 4096, progress = 0.925822
srv  proxy_reques: proxying request to model Qwen3.5-27B on port 43981
[43981] srv  params_from_: Chat format: peg-native
[43981] slot get_availabl: id  0 | task -1 | selected slot by LCP similarity, sim_best = 0.974 (> 0.100 thold), f_keep = 0.997
[43981] slot launch_slot_: id  0 | task -1 | sampler chain: logits -> penalties -> ?dry -> ?top-n-sigma -> top-k -> ?typical -> ?top-p -> min-p -> ?xtc -> temp-ext -> dist 
[43981] slot launch_slot_: id  0 | task 73505 | processing task, is_child = 0
[43981] slot update_slots: id  0 | task 73505 | new prompt, n_ctx_slot = 102400, n_keep = 0, task.n_tokens = 30222
[43981] slot update_slots: id  0 | task 73505 | n_past = 29449, slot.prompt.tokens.size() = 29528, seq_id = 0, pos_min = 29527, n_swa = 0
[43981] slot update_slots: id  0 | task 73505 | Checking checkpoint with [29448, 29448] against 29449...
[43981] slot update_slots: id  0 | task 73505 | restored context checkpoint (pos_min = 29448, pos_max = 29448, n_tokens = 29449, n_past = 29449, size = 62.813 MiB)
[43981] slot update_slots: id  0 | task 73505 | n_tokens = 29449, memory_seq_rm [29449, end)
[43981] slot update_slots: id  0 | task 73505 | prompt processing progress, n_tokens = 30218, batch.n_tokens = 769, progress = 0.999868
[43981] slot update_slots: id  1 | task 73503 | n_tokens = 37331, memory_seq_rm [37331, end)
[43981] slot update_slots: id  1 | task 73503 | prompt processing progress, n_tokens = 39294, batch.n_tokens = 2732, progress = 0.974505
[43981] slot update_slots: id  0 | task 73505 | n_tokens = 30218, memory_seq_rm [30218, end)
[43981] slot init_sampler: id  0 | task 73505 | init sampler, took 8.15 ms, tokens: text = 30222, total = 30222
[43981] slot update_slots: id  0 | task 73505 | prompt processing done, n_tokens = 30222, batch.n_tokens = 4
[43981] slot create_check: id  0 | task 73505 | created context checkpoint 6 of 32 (pos_min = 30217, pos_max = 30217, n_tokens = 30218, size = 62.813 MiB)
[43981] slot update_slots: id  1 | task 73503 | n_tokens = 39294, memory_seq_rm [39294, end)
[43981] slot update_slots: id  1 | task 73503 | prompt processing progress, n_tokens = 40318, batch.n_tokens = 1028, progress = 0.999901
[43981] slot create_check: id  1 | task 73503 | created context checkpoint 7 of 32 (pos_min = 39293, pos_max = 39293, n_tokens = 39294, size = 62.813 MiB)
[43981] slot update_slots: id  1 | task 73503 | n_tokens = 40318, memory_seq_rm [40318, end)
[43981] srv  log_server_r: done request: POST /v1/chat/completions 127.0.0.1 200
srv  log_server_r: done request: POST /v1/chat/completions 127.0.0.1 200
[43981] slot init_sampler: id  1 | task 73503 | init sampler, took 10.06 ms, tokens: text = 40322, total = 40322
[43981] slot update_slots: id  1 | task 73503 | prompt processing done, n_tokens = 40322, batch.n_tokens = 5
[43981] slot create_check: id  1 | task 73503 | created context checkpoint 8 of 32 (pos_min = 40317, pos_max = 40317, n_tokens = 40318, size = 62.813 MiB)
[43981] srv  log_server_r: done request: POST /v1/chat/completions 127.0.0.1 200
srv  log_server_r: done request: POST /v1/chat/completions 127.0.0.1 200
srv    operator(): http client error: Connection handling canceled
[43981] srv          stop: cancel task, id_task = 73503
[43981] slot      release: id  1 | task 73503 | stop processing: n_tokens = 40330, truncated = 0
[43981] slot print_timing: id  0 | task 73505 | 
[43981] prompt eval time =    8806.15 ms /   773 tokens (   11.39 ms per token,    87.78 tokens per second)
[43981]        eval time =   12535.17 ms /   120 tokens (  104.46 ms per token,     9.57 tokens per second)
[43981]       total time =   21341.31 ms /   893 tokens
[43981] slot      release: id  0 | task 73505 | stop processing: n_tokens = 30341, truncated = 0
[43981] srv  update_slots: all slots are idle