From 782ddf371702714e677368b68cd5db2ffd12ee5b Mon Sep 17 00:00:00 2001
From: Mariusz Kogen
Date: Thu, 14 Sep 2023 13:45:46 +0200
Subject: [PATCH] Remove 'n_gqa' Related Code (#732)

---
 api/src/serge/models/chat.py          |  1 -
 api/src/serge/routers/chat.py         |  4 ----
 api/src/serge/utils/llm.py            |  5 -----
 web/src/routes/+page.svelte           | 16 ----------------
 web/src/routes/chat/[id]/+page.svelte |  3 +--
 web/src/routes/chat/[id]/+page.ts     |  1 -
 6 files changed, 1 insertion(+), 29 deletions(-)

diff --git a/api/src/serge/models/chat.py b/api/src/serge/models/chat.py
index d297f00..e5cc7b1 100644
--- a/api/src/serge/models/chat.py
+++ b/api/src/serge/models/chat.py
@@ -8,7 +8,6 @@ class ChatParameters(BaseModel):
     model_path: str
     n_ctx: int
     n_gpu_layers: int
-    n_gqa: int | None
     # n_parts: int
     # seed: int
     # f16_kv: bool
diff --git a/api/src/serge/routers/chat.py b/api/src/serge/routers/chat.py
index abdecd1..d026743 100644
--- a/api/src/serge/routers/chat.py
+++ b/api/src/serge/routers/chat.py
@@ -25,7 +25,6 @@ async def create_new_chat(
     max_length: int = 2048,
     context_window: int = 2048,
     gpu_layers: Optional[int] = None,
-    gqa: int | None = 0,
     repeat_last_n: int = 64,
     repeat_penalty: float = 1.3,
     init_prompt: str = "Below is an instruction that describes a task. Write a response that appropriately completes the request.",
@@ -50,7 +49,6 @@ async def create_new_chat(
         max_tokens=max_length,
         n_ctx=context_window,
         n_gpu_layers=gpu_layers,
-        n_gqa=gqa if gqa else None,
         last_n_tokens_size=repeat_last_n,
         repeat_penalty=repeat_penalty,
         n_threads=n_threads,
@@ -212,7 +210,6 @@ def stream_ask_a_question(chat_id: str, prompt: str):
         model_path="/usr/src/app/weights/" + chat.params.model_path + ".bin",
         n_ctx=len(chat.params.init_prompt) + chat.params.n_ctx,
         n_gpu_layers=chat.params.n_gpu_layers,
-        n_gqa=chat.params.n_gqa if chat.params.n_gqa else None,
         n_threads=chat.params.n_threads,
         last_n_tokens_size=chat.params.last_n_tokens_size,
     )
@@ -284,7 +281,6 @@ async def ask_a_question(chat_id: str, prompt: str):
         n_ctx=len(chat.params.init_prompt) + chat.params.n_ctx,
         n_threads=chat.params.n_threads,
         n_gpu_layers=chat.params.n_gpu_layers,
-        n_gqa=chat.params.n_gqa if chat.params.n_gqa else None,
         last_n_tokens_size=chat.params.last_n_tokens_size,
     )
     answer = client(
diff --git a/api/src/serge/utils/llm.py b/api/src/serge/utils/llm.py
index cbc0c85..749b22f 100644
--- a/api/src/serge/utils/llm.py
+++ b/api/src/serge/utils/llm.py
@@ -29,9 +29,6 @@ class LlamaCpp(LLM):
     n_gpu_layers: int = Field(0, alias="n_gpu_layers")
     """The number of layers to put on the GPU. The rest will be on the CPU."""
 
-    n_gqa: int | None = 0
-    """Grouped-query attention factor parameter. Set to 8 for LLaMA2"""
-
     n_parts: int = Field(-1, alias="n_parts")
     """Number of parts to split the model into.
     If -1, the number of parts is automatically determined."""
@@ -125,7 +122,6 @@ class LlamaCpp(LLM):
             "n_threads": self.n_threads,
             "n_ctx": self.n_ctx,
             "n_gpu_layers": self.n_gpu_layers,
-            "n_gqa": self.n_gqa if self.n_gqa else None,
             "n_parts": self.n_parts,
             "seed": self.seed,
             "f16_kv": self.f16_kv,
@@ -171,7 +167,6 @@ class LlamaCpp(LLM):
             model_path="/usr/src/app/weights/" + self.model_path + ".bin",
             n_ctx=self.n_ctx,
             n_gpu_layers=self.n_gpu_layers,
-            n_gqa=self.n_gqa if self.n_gqa else None,
             n_parts=self.n_parts,
             seed=self.seed,
             f16_kv=self.f16_kv,
diff --git a/web/src/routes/+page.svelte b/web/src/routes/+page.svelte
index 5da5e19..8317586 100644
--- a/web/src/routes/+page.svelte
+++ b/web/src/routes/+page.svelte
@@ -14,7 +14,6 @@
 	let temp = 0.1;
 	let top_k = 50;
-	let gqa = 0;
 	let top_p = 0.95;
 
 	let max_length = 2048;
@@ -186,21 +185,6 @@
 			class="range range-sm mt-auto"
 		/>
-	[the fifteen removed lines of GQA slider markup; their tag content is not recoverable from this copy of the patch]
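
A note on why this field is safe to delete: `n_gqa` was a stopgap load-time flag that llama.cpp needed for LLaMA-2 70B in the GGML era; with GGUF model files the grouped-query-attention head counts are stored in the model metadata, so nothing has to be passed at load time. Below is a minimal sketch of the pruned parameter model after this patch, assuming pydantic is available (the field names are taken from the models/chat.py hunk above; the example values are invented):

    from pydantic import BaseModel

    class ChatParameters(BaseModel):
        """Load-time parameters that remain once n_gqa is removed."""
        model_path: str
        n_ctx: int
        n_gpu_layers: int

    # Hypothetical values, only to show the model validates with no n_gqa field.
    params = ChatParameters(model_path="llama-2-7b-chat", n_ctx=2048, n_gpu_layers=0)
    print(params)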