Remove 'n_gqa' Related Code (#732)
This commit is contained in:
parent
e87d0209c8
commit
782ddf3717
@ -8,7 +8,6 @@ class ChatParameters(BaseModel):
|
||||
model_path: str
|
||||
n_ctx: int
|
||||
n_gpu_layers: int
|
||||
n_gqa: int | None
|
||||
# n_parts: int
|
||||
# seed: int
|
||||
# f16_kv: bool
|
||||
|
||||
@ -25,7 +25,6 @@ async def create_new_chat(
|
||||
max_length: int = 2048,
|
||||
context_window: int = 2048,
|
||||
gpu_layers: Optional[int] = None,
|
||||
gqa: int | None = 0,
|
||||
repeat_last_n: int = 64,
|
||||
repeat_penalty: float = 1.3,
|
||||
init_prompt: str = "Below is an instruction that describes a task. Write a response that appropriately completes the request.",
|
||||
@ -50,7 +49,6 @@ async def create_new_chat(
|
||||
max_tokens=max_length,
|
||||
n_ctx=context_window,
|
||||
n_gpu_layers=gpu_layers,
|
||||
n_gqa=gqa if gqa else None,
|
||||
last_n_tokens_size=repeat_last_n,
|
||||
repeat_penalty=repeat_penalty,
|
||||
n_threads=n_threads,
|
||||
@ -212,7 +210,6 @@ def stream_ask_a_question(chat_id: str, prompt: str):
|
||||
model_path="/usr/src/app/weights/" + chat.params.model_path + ".bin",
|
||||
n_ctx=len(chat.params.init_prompt) + chat.params.n_ctx,
|
||||
n_gpu_layers=chat.params.n_gpu_layers,
|
||||
n_gqa=chat.params.n_gqa if chat.params.n_gqa else None,
|
||||
n_threads=chat.params.n_threads,
|
||||
last_n_tokens_size=chat.params.last_n_tokens_size,
|
||||
)
|
||||
@ -284,7 +281,6 @@ async def ask_a_question(chat_id: str, prompt: str):
|
||||
n_ctx=len(chat.params.init_prompt) + chat.params.n_ctx,
|
||||
n_threads=chat.params.n_threads,
|
||||
n_gpu_layers=chat.params.n_gpu_layers,
|
||||
n_gqa=chat.params.n_gqa if chat.params.n_gqa else None,
|
||||
last_n_tokens_size=chat.params.last_n_tokens_size,
|
||||
)
|
||||
answer = client(
|
||||
|
||||
@ -29,9 +29,6 @@ class LlamaCpp(LLM):
|
||||
n_gpu_layers: int = Field(0, alias="n_gpu_layers")
|
||||
"""The number of layers to put on the GPU. The rest will be on the CPU."""
|
||||
|
||||
n_gqa: int | None = 0
|
||||
"""Grouped-query attention factor parameter. Set to 8 for LLaMA2"""
|
||||
|
||||
n_parts: int = Field(-1, alias="n_parts")
|
||||
"""Number of parts to split the model into.
|
||||
If -1, the number of parts is automatically determined."""
|
||||
@ -125,7 +122,6 @@ class LlamaCpp(LLM):
|
||||
"n_threads": self.n_threads,
|
||||
"n_ctx": self.n_ctx,
|
||||
"n_gpu_layers": self.n_gpu_layers,
|
||||
"n_gqa": self.n_gqa if self.n_gqa else None,
|
||||
"n_parts": self.n_parts,
|
||||
"seed": self.seed,
|
||||
"f16_kv": self.f16_kv,
|
||||
@ -171,7 +167,6 @@ class LlamaCpp(LLM):
|
||||
model_path="/usr/src/app/weights/" + self.model_path + ".bin",
|
||||
n_ctx=self.n_ctx,
|
||||
n_gpu_layers=self.n_gpu_layers,
|
||||
n_gqa=self.n_gqa if self.n_gqa else None,
|
||||
n_parts=self.n_parts,
|
||||
seed=self.seed,
|
||||
f16_kv=self.f16_kv,
|
||||
|
||||
@ -14,7 +14,6 @@
|
||||
|
||||
let temp = 0.1;
|
||||
let top_k = 50;
|
||||
let gqa = 0;
|
||||
let top_p = 0.95;
|
||||
|
||||
let max_length = 2048;
|
||||
@ -186,21 +185,6 @@
|
||||
class="range range-sm mt-auto"
|
||||
/>
|
||||
</div>
|
||||
<div
|
||||
class="tooltip flex flex-col"
|
||||
data-tip="Grouped-query attention factor parameter. Set to 8 for LLaMA2"
|
||||
>
|
||||
<label for="gqa" class="label-text pb-1">gqa</label>
|
||||
<input
|
||||
class="input-bordered input w-full max-w-xs"
|
||||
name="gqa"
|
||||
type="number"
|
||||
bind:value={gqa}
|
||||
min="0"
|
||||
max="8"
|
||||
step="1"
|
||||
/>
|
||||
</div>
|
||||
<div
|
||||
class="tooltip col-span-2"
|
||||
data-tip="Number of layers to put on the GPU. The rest will be on the CPU."
|
||||
|
||||
@ -123,8 +123,7 @@
|
||||
`/api/chat/?model=${data.chat.params.model_path}&temperature=${data.chat.params.temperature}&top_k=${data.chat.params.top_k}` +
|
||||
`&top_p=${data.chat.params.top_p}&max_length=${data.chat.params.max_tokens}&context_window=${data.chat.params.n_ctx}` +
|
||||
`&repeat_last_n=${data.chat.params.last_n_tokens_size}&repeat_penalty=${data.chat.params.repeat_penalty}` +
|
||||
`&n_threads=${data.chat.params.n_threads}&init_prompt=${data.chat.history[0].data.content}` +
|
||||
`&gpu_layers=${data.chat.params.n_gpu_layers}&n_gqa=${data.chat.params.n_gqa}`,
|
||||
`&n_threads=${data.chat.params.n_threads}&init_prompt=${data.chat.history[0].data.content}`,
|
||||
|
||||
{
|
||||
method: "POST",
|
||||
|
||||
@ -16,7 +16,6 @@ interface Params {
|
||||
n_ctx: number;
|
||||
n_gpu_layers: number;
|
||||
n_threads: number;
|
||||
n_gqa: number;
|
||||
last_n_tokens_size: number;
|
||||
max_tokens: number;
|
||||
temperature: number;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user