
Feat: Correct context length for models #2867

Merged · 19 commits · May 6, 2024
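This PR replaces the blanket 4096-token `ctx_len` and `max_tokens` in each bundled model.json with the model's actual context window (for example 8192 for the Gemma models, 32768 for Mistral and Mixtral, 131072 for Command-R), bumping each file's version alongside. A minimal TypeScript sketch of the invariant the corrected configs should satisfy follows; the `resources/models/<id>/model.json` layout and the script name are assumptions for illustration, not part of the PR:

// validate-ctx.ts — a hedged sketch, not code from this PR.
// Checks that every model.json keeps max_tokens within ctx_len.
// The resources/models/<id>/model.json layout is assumed for illustration.
import { readdirSync, readFileSync } from "fs";
import { join } from "path";

interface ModelConfig {
  id: string;
  settings: { ctx_len: number };
  parameters: { max_tokens: number };
}

const root = "extensions/inference-nitro-extension/resources/models"; // assumed path

for (const dir of readdirSync(root)) {
  const config: ModelConfig = JSON.parse(
    readFileSync(join(root, dir, "model.json"), "utf8")
  );
  const { ctx_len } = config.settings;
  const { max_tokens } = config.parameters;
  if (max_tokens > ctx_len) {
    console.error(`${config.id}: max_tokens ${max_tokens} exceeds ctx_len ${ctx_len}`);
  }
}

Setting `max_tokens` equal to `ctx_len` lets a completion use the whole window; a long prompt plus `max_tokens` can still exceed the window, so capping generation at the context limit remains the engine's job at runtime.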
extensions/inference-nitro-extension/package.json (2 changes: 1 addition & 1 deletion)
@@ -1,7 +1,7 @@
 {
   "name": "@janhq/inference-nitro-extension",
   "productName": "Nitro Inference Engine",
-  "version": "1.0.4",
+  "version": "1.0.5",
   "description": "This extension embeds Nitro, a lightweight (3mb) inference engine written in C++. See https://nitro.jan.ai.\nAdditional dependencies could be installed to run without Cuda Toolkit installation.",
   "main": "dist/index.js",
   "node": "dist/node/index.cjs.js",
codeninja-1.0-7b model.json
@@ -8,19 +8,19 @@
   "id": "codeninja-1.0-7b",
   "object": "model",
   "name": "CodeNinja 7B Q4",
-  "version": "1.0",
+  "version": "1.1",
   "description": "CodeNinja is good for coding tasks and can handle various languages including Python, C, C++, Rust, Java, JavaScript, and more.",
   "format": "gguf",
   "settings": {
-    "ctx_len": 4096,
+    "ctx_len": 8192,
     "prompt_template": "GPT4 Correct User: {prompt}<|end_of_turn|>GPT4 Correct Assistant:",
     "llama_model_path": "codeninja-1.0-openchat-7b.Q4_K_M.gguf"
   },
   "parameters": {
     "temperature": 0.7,
     "top_p": 0.95,
     "stream": true,
-    "max_tokens": 4096,
+    "max_tokens": 8192,
     "frequency_penalty": 0,
     "presence_penalty": 0
   },
command-r-34b model.json
@@ -8,19 +8,19 @@
   "id": "command-r-34b",
   "object": "model",
   "name": "Command-R v01 34B Q4",
-  "version": "1.3",
+  "version": "1.4",
   "description": "C4AI Command-R developed by CohereAI is optimized for a variety of use cases including reasoning, summarization, and question answering.",
   "format": "gguf",
   "settings": {
-    "ctx_len": 4096,
+    "ctx_len": 131072,
     "prompt_template": "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
     "llama_model_path": "c4ai-command-r-v01-Q4_K_M.gguf"
   },
   "parameters": {
     "temperature": 0.7,
     "top_p": 0.95,
     "stream": true,
-    "max_tokens": 4096,
+    "max_tokens": 131072,
     "stop": [],
     "frequency_penalty": 0,
     "presence_penalty": 0
deepseek-coder-1.3b model.json
@@ -8,19 +8,19 @@
   "id": "deepseek-coder-1.3b",
   "object": "model",
   "name": "Deepseek Coder 1.3B Q8",
-  "version": "1.0",
+  "version": "1.1",
   "description": "Deepseek Coder excelled in project-level code completion with advanced capabilities across multiple programming languages.",
   "format": "gguf",
   "settings": {
-    "ctx_len": 4096,
+    "ctx_len": 16384,
     "prompt_template": "### Instruction:\n{prompt}\n### Response:",
     "llama_model_path": "deepseek-coder-1.3b-instruct.Q8_0.gguf"
   },
   "parameters": {
     "temperature": 0.7,
     "top_p": 0.95,
     "stream": true,
-    "max_tokens": 4096,
+    "max_tokens": 16384,
     "stop": [],
     "frequency_penalty": 0,
     "presence_penalty": 0
deepseek-coder-34b model.json (also switches the download from Q5_K_M to Q4_K_M)
@@ -1,26 +1,26 @@
 {
   "sources": [
     {
-      "filename": "deepseek-coder-33b-instruct.Q5_K_M.gguf",
-      "url": "https://huggingface.co/TheBloke/deepseek-coder-33B-instruct-GGUF/resolve/main/deepseek-coder-33b-instruct.Q5_K_M.gguf"
+      "filename": "deepseek-coder-33b-instruct.Q4_K_M.gguf",
+      "url": "https://huggingface.co/TheBloke/deepseek-coder-33B-instruct-GGUF/resolve/main/deepseek-coder-33b-instruct.Q4_K_M.gguf"
     }
   ],
   "id": "deepseek-coder-34b",
   "object": "model",
-  "name": "Deepseek Coder 33B Q5",
-  "version": "1.0",
+  "name": "Deepseek Coder 33B Q4",
+  "version": "1.1",
   "description": "Deepseek Coder excelled in project-level code completion with advanced capabilities across multiple programming languages.",
   "format": "gguf",
   "settings": {
-    "ctx_len": 4096,
+    "ctx_len": 16384,
     "prompt_template": "### Instruction:\n{prompt}\n### Response:",
-    "llama_model_path": "deepseek-coder-33b-instruct.Q5_K_M.gguf"
+    "llama_model_path": "deepseek-coder-33b-instruct.Q4_K_M.gguf"
   },
   "parameters": {
     "temperature": 0.7,
     "top_p": 0.95,
     "stream": true,
-    "max_tokens": 4096,
+    "max_tokens": 16384,
     "stop": [],
     "frequency_penalty": 0,
     "presence_penalty": 0
gemma-2b model.json
@@ -8,19 +8,19 @@
   "id": "gemma-2b",
   "object": "model",
   "name": "Gemma 2B Q4",
-  "version": "1.0",
+  "version": "1.1",
   "description": "Gemma is built from the same technology with Google's Gemini.",
   "format": "gguf",
   "settings": {
-    "ctx_len": 4096,
+    "ctx_len": 8192,
     "prompt_template": "<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model",
     "llama_model_path": "gemma-2b-it-q4_k_m.gguf"
   },
   "parameters": {
     "temperature": 0.7,
     "top_p": 0.95,
     "stream": true,
-    "max_tokens": 4096,
+    "max_tokens": 8192,
     "stop": [],
     "frequency_penalty": 0,
     "presence_penalty": 0
gemma-7b model.json
@@ -8,19 +8,19 @@
   "id": "gemma-7b",
   "object": "model",
   "name": "Gemma 7B Q4",
-  "version": "1.0",
+  "version": "1.1",
   "description": "Google's Gemma is built for multilingual purpose",
   "format": "gguf",
   "settings": {
-    "ctx_len": 4096,
+    "ctx_len": 8192,
     "prompt_template": "<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model",
     "llama_model_path": "gemma-7b-it-q4_K_M.gguf"
   },
   "parameters": {
     "temperature": 0.7,
     "top_p": 0.95,
     "stream": true,
-    "max_tokens": 4096,
+    "max_tokens": 8192,
     "stop": [],
     "frequency_penalty": 0,
     "presence_penalty": 0
mistral-ins-7b-q4 model.json (also adds "[/INST]" as a stop sequence)
@@ -8,20 +8,20 @@
   "id": "mistral-ins-7b-q4",
   "object": "model",
   "name": "Mistral Instruct 7B Q4",
-  "version": "1.0",
+  "version": "1.1",
   "description": "Mistral Instruct 7b model, specifically designed for a comprehensive understanding of the world.",
   "format": "gguf",
   "settings": {
-    "ctx_len": 4096,
+    "ctx_len": 32768,
     "prompt_template": "[INST] {prompt} [/INST]",
     "llama_model_path": "mistral-7b-instruct-v0.2.Q4_K_M.gguf"
   },
   "parameters": {
     "temperature": 0.7,
     "top_p": 0.95,
     "stream": true,
-    "max_tokens": 4096,
-    "stop": [],
+    "max_tokens": 32768,
+    "stop": ["[/INST]"],
     "frequency_penalty": 0,
     "presence_penalty": 0
   },
mixtral-8x7b-instruct model.json
@@ -8,19 +8,19 @@
   "id": "mixtral-8x7b-instruct",
   "object": "model",
   "name": "Mixtral 8x7B Instruct Q4",
-  "version": "1.0",
+  "version": "1.1",
   "description": "The Mixtral-8x7B is a pretrained generative Sparse Mixture of Experts. The Mixtral-8x7B outperforms 70B models on most benchmarks.",
   "format": "gguf",
   "settings": {
-    "ctx_len": 4096,
+    "ctx_len": 32768,
     "prompt_template": "[INST] {prompt} [/INST]",
     "llama_model_path": "mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf"
   },
   "parameters": {
     "temperature": 0.7,
     "top_p": 0.95,
     "stream": true,
-    "max_tokens": 4096,
+    "max_tokens": 32768,
     "frequency_penalty": 0,
     "presence_penalty": 0
   },
noromaid-7b model.json
@@ -8,19 +8,19 @@
   "id": "noromaid-7b",
   "object": "model",
   "name": "Noromaid 7B Q4",
-  "version": "1.0",
+  "version": "1.1",
   "description": "The Noromaid 7b model is designed for role-playing with human-like behavior.",
   "format": "gguf",
   "settings": {
-    "ctx_len": 4096,
+    "ctx_len": 32768,
     "prompt_template": "<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant",
     "llama_model_path": "Noromaid-7B-0.4-DPO.q4_k_m.gguf"
   },
   "parameters": {
     "temperature": 0.7,
     "top_p": 0.95,
     "stream": true,
-    "max_tokens": 4096,
+    "max_tokens": 32768,
     "stop": [],
     "frequency_penalty": 0,
     "presence_penalty": 0
openchat-3.5-7b model.json
@@ -8,19 +8,19 @@
   "id": "openchat-3.5-7b",
   "object": "model",
   "name": "Openchat-3.5 7B Q4",
-  "version": "1.0",
+  "version": "1.1",
   "description": "The performance of Openchat surpasses ChatGPT-3.5 and Grok-1 across various benchmarks.",
   "format": "gguf",
   "settings": {
-    "ctx_len": 4096,
+    "ctx_len": 8192,
     "prompt_template": "GPT4 Correct User: {prompt}<|end_of_turn|>GPT4 Correct Assistant:",
     "llama_model_path": "openchat-3.5-0106.Q4_K_M.gguf"
   },
   "parameters": {
     "temperature": 0.7,
     "top_p": 0.95,
     "stream": true,
-    "max_tokens": 4096,
+    "max_tokens": 8192,
     "stop": ["<|end_of_turn|>"],
     "frequency_penalty": 0,
     "presence_penalty": 0
phind-34b model.json
@@ -8,19 +8,19 @@
   "id": "phind-34b",
   "object": "model",
   "name": "Phind 34B Q4",
-  "version": "1.1",
+  "version": "1.2",
   "description": "Phind 34B is the best Open-source coding model.",
   "format": "gguf",
   "settings": {
-    "ctx_len": 4096,
+    "ctx_len": 16384,
     "prompt_template": "### System Prompt\n{system_message}\n### User Message\n{prompt}\n### Assistant",
     "llama_model_path": "phind-codellama-34b-v2.Q4_K_M.gguf"
   },
   "parameters": {
     "temperature": 0.7,
     "top_p": 0.95,
     "stream": true,
-    "max_tokens": 4096,
+    "max_tokens": 16384,
     "stop": [],
     "frequency_penalty": 0,
     "presence_penalty": 0
qwen-7b model.json
@@ -8,19 +8,19 @@
   "id": "qwen-7b",
   "object": "model",
   "name": "Qwen Chat 7B Q4",
-  "version": "1.0",
+  "version": "1.1",
   "description": "Qwen is optimized at Chinese, ideal for everyday tasks.",
   "format": "gguf",
   "settings": {
-    "ctx_len": 4096,
+    "ctx_len": 32768,
     "prompt_template": "<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant",
     "llama_model_path": "qwen1_5-7b-chat-q4_k_m.gguf"
   },
   "parameters": {
     "temperature": 0.7,
     "top_p": 0.95,
     "stream": true,
-    "max_tokens": 4096,
+    "max_tokens": 32768,
     "stop": [],
     "frequency_penalty": 0,
     "presence_penalty": 0
stealth-v1.3 model.json
@@ -12,15 +12,15 @@
   "description": "This is a new experimental family designed to enhance Mathematical and Logical abilities.",
   "format": "gguf",
   "settings": {
-    "ctx_len": 4096,
+    "ctx_len": 32768,
     "prompt_template": "<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant",
     "llama_model_path": "stealth-v1.3.Q4_K_M.gguf"
   },
   "parameters": {
     "temperature": 0.7,
     "top_p": 0.95,
     "stream": true,
-    "max_tokens": 4096,
+    "max_tokens": 32768,
     "frequency_penalty": 0,
     "presence_penalty": 0
   },
trinity-v1.2 model.json
@@ -12,15 +12,15 @@
   "description": "Trinity is an experimental model merge using the Slerp method. Recommended for daily assistance purposes.",
   "format": "gguf",
   "settings": {
-    "ctx_len": 4096,
+    "ctx_len": 32768,
     "prompt_template": "<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant",
     "llama_model_path": "trinity-v1.2.Q4_K_M.gguf"
   },
   "parameters": {
     "temperature": 0.7,
     "top_p": 0.95,
     "stream": true,
-    "max_tokens": 4096,
+    "max_tokens": 32768,
     "frequency_penalty": 0,
     "presence_penalty": 0
   },
vistral-7b model.json
@@ -8,19 +8,19 @@
   "id": "vistral-7b",
   "object": "model",
   "name": "Vistral 7B Q4",
-  "version": "1.0",
+  "version": "1.1",
   "description": "Vistral 7B has a deep understanding of Vietnamese.",
   "format": "gguf",
   "settings": {
-    "ctx_len": 4096,
+    "ctx_len": 32768,
     "prompt_template": "[INST] <<SYS>>\n{system_message}\n<</SYS>>\n{prompt} [/INST]",
     "llama_model_path": "vistral-7b-chat-dpo.Q4_K_M.gguf"
   },
   "parameters": {
     "temperature": 0.7,
     "top_p": 0.95,
     "stream": true,
-    "max_tokens": 4096,
+    "max_tokens": 32768,
     "stop": [],
     "frequency_penalty": 0,
     "presence_penalty": 0
wizardcoder-python-13b model.json
@@ -12,15 +12,15 @@
   "description": "WizardCoder 13B is a Python coding model. This model demonstrate high proficiency in specific domains like coding and mathematics.",
   "format": "gguf",
   "settings": {
-    "ctx_len": 4096,
+    "ctx_len": 16384,
     "prompt_template": "### Instruction:\n{prompt}\n### Response:",
     "llama_model_path": "wizardcoder-python-13b-v1.0.Q4_K_M.gguf"
   },
   "parameters": {
     "temperature": 0.7,
     "top_p": 0.95,
     "stream": true,
-    "max_tokens": 4096,
+    "max_tokens": 16384,
     "stop": [],
     "frequency_penalty": 0,
     "presence_penalty": 0
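For context on where these settings land: at load time the extension forwards `ctx_len` and `llama_model_path` to the local Nitro server. A hedged sketch of that call follows — the endpoint path and port 3928 reflect Nitro's public docs as I recall them, and the `ngl` value is illustrative; none of this is taken from the diff above:

// Hedged sketch: load a model into a locally running Nitro server with the
// corrected context length. Endpoint path and port are assumptions based on
// Nitro's public docs, not values confirmed by this PR.
async function loadModel(): Promise<void> {
  const body = {
    llama_model_path: "mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    ctx_len: 32768, // the value this PR sets for Mistral Instruct 7B Q4
    ngl: 100,       // GPU layers to offload; illustrative only
  };
  const res = await fetch("http://localhost:3928/inferences/llamacpp/loadmodel", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(body),
  });
  if (!res.ok) throw new Error(`loadmodel failed: ${res.status}`);
}

loadModel().catch(console.error);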