Remove 'n_gqa' Related Code (#732)

Mariusz Kogen 2023-09-14 13:45:46 +02:00 committed by GitHub
parent e87d0209c8
commit 782ddf3717
6 changed files with 1 addition and 29 deletions
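
For reference, below is a minimal sketch of how the llama-cpp-python client can be constructed once `n_gqa` is gone, using only keyword arguments that appear in the diff that follows. The model path and parameter values are illustrative placeholders, and the comment about the model file carrying the grouped-query attention configuration is an assumption about why the parameter is no longer needed, not something stated in this commit.

```python
# Minimal sketch (not the repository's exact code): constructing the
# llama-cpp-python client without the removed n_gqa argument.
from llama_cpp import Llama

client = Llama(
    model_path="/usr/src/app/weights/model.bin",  # illustrative path
    n_ctx=2048,              # context window
    n_gpu_layers=0,          # layers to offload to the GPU
    n_threads=4,             # CPU threads
    last_n_tokens_size=64,   # window used for the repeat penalty
    # n_gqa is no longer passed; assumption: current llama.cpp builds read the
    # grouped-query attention configuration from the model file itself.
)
```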

@@ -8,7 +8,6 @@ class ChatParameters(BaseModel):
     model_path: str
     n_ctx: int
     n_gpu_layers: int
-    n_gqa: int | None
     # n_parts: int
     # seed: int
     # f16_kv: bool

@@ -25,7 +25,6 @@ async def create_new_chat(
     max_length: int = 2048,
     context_window: int = 2048,
     gpu_layers: Optional[int] = None,
-    gqa: int | None = 0,
     repeat_last_n: int = 64,
     repeat_penalty: float = 1.3,
     init_prompt: str = "Below is an instruction that describes a task. Write a response that appropriately completes the request.",
@@ -50,7 +49,6 @@ async def create_new_chat(
         max_tokens=max_length,
         n_ctx=context_window,
         n_gpu_layers=gpu_layers,
-        n_gqa=gqa if gqa else None,
         last_n_tokens_size=repeat_last_n,
         repeat_penalty=repeat_penalty,
         n_threads=n_threads,
@@ -212,7 +210,6 @@ def stream_ask_a_question(chat_id: str, prompt: str):
         model_path="/usr/src/app/weights/" + chat.params.model_path + ".bin",
         n_ctx=len(chat.params.init_prompt) + chat.params.n_ctx,
         n_gpu_layers=chat.params.n_gpu_layers,
-        n_gqa=chat.params.n_gqa if chat.params.n_gqa else None,
         n_threads=chat.params.n_threads,
         last_n_tokens_size=chat.params.last_n_tokens_size,
     )
@@ -284,7 +281,6 @@ async def ask_a_question(chat_id: str, prompt: str):
         n_ctx=len(chat.params.init_prompt) + chat.params.n_ctx,
         n_threads=chat.params.n_threads,
         n_gpu_layers=chat.params.n_gpu_layers,
-        n_gqa=chat.params.n_gqa if chat.params.n_gqa else None,
         last_n_tokens_size=chat.params.last_n_tokens_size,
     )
     answer = client(

@@ -29,9 +29,6 @@ class LlamaCpp(LLM):
     n_gpu_layers: int = Field(0, alias="n_gpu_layers")
     """The number of layers to put on the GPU. The rest will be on the CPU."""
-    n_gqa: int | None = 0
-    """Grouped-query attention factor parameter. Set to 8 for LLaMA2"""
     n_parts: int = Field(-1, alias="n_parts")
     """Number of parts to split the model into.
     If -1, the number of parts is automatically determined."""
@@ -125,7 +122,6 @@ class LlamaCpp(LLM):
             "n_threads": self.n_threads,
             "n_ctx": self.n_ctx,
             "n_gpu_layers": self.n_gpu_layers,
-            "n_gqa": self.n_gqa if self.n_gqa else None,
             "n_parts": self.n_parts,
             "seed": self.seed,
             "f16_kv": self.f16_kv,
@@ -171,7 +167,6 @@ class LlamaCpp(LLM):
             model_path="/usr/src/app/weights/" + self.model_path + ".bin",
             n_ctx=self.n_ctx,
             n_gpu_layers=self.n_gpu_layers,
-            n_gqa=self.n_gqa if self.n_gqa else None,
             n_parts=self.n_parts,
             seed=self.seed,
             f16_kv=self.f16_kv,

@@ -14,7 +14,6 @@
   let temp = 0.1;
   let top_k = 50;
-  let gqa = 0;
   let top_p = 0.95;
   let max_length = 2048;
@@ -186,21 +185,6 @@
             class="range range-sm mt-auto"
           />
         </div>
-        <div
-          class="tooltip flex flex-col"
-          data-tip="Grouped-query attention factor parameter. Set to 8 for LLaMA2"
-        >
-          <label for="gqa" class="label-text pb-1">gqa</label>
-          <input
-            class="input-bordered input w-full max-w-xs"
-            name="gqa"
-            type="number"
-            bind:value={gqa}
-            min="0"
-            max="8"
-            step="1"
-          />
-        </div>
         <div
           class="tooltip col-span-2"
           data-tip="Number of layers to put on the GPU. The rest will be on the CPU."

@@ -123,8 +123,7 @@
       `/api/chat/?model=${data.chat.params.model_path}&temperature=${data.chat.params.temperature}&top_k=${data.chat.params.top_k}` +
       `&top_p=${data.chat.params.top_p}&max_length=${data.chat.params.max_tokens}&context_window=${data.chat.params.n_ctx}` +
       `&repeat_last_n=${data.chat.params.last_n_tokens_size}&repeat_penalty=${data.chat.params.repeat_penalty}` +
-      `&n_threads=${data.chat.params.n_threads}&init_prompt=${data.chat.history[0].data.content}` +
-      `&gpu_layers=${data.chat.params.n_gpu_layers}&n_gqa=${data.chat.params.n_gqa}`,
+      `&n_threads=${data.chat.params.n_threads}&init_prompt=${data.chat.history[0].data.content}`,
       {
         method: "POST",

@@ -16,7 +16,6 @@ interface Params {
   n_ctx: number;
   n_gpu_layers: number;
   n_threads: number;
-  n_gqa: number;
   last_n_tokens_size: number;
   max_tokens: number;
   temperature: number;