added Tokens, openrouter, memory system
CI / check (push) Successful in 4m11s
CI / test (push) Successful in 3m57s
CI / clippy (push) Has been cancelled

Sithies
2026-03-21 19:59:07 +01:00
parent 4e6b2c6759
commit 18b666f45d
41 changed files with 3217 additions and 258 deletions
-6
@@ -1,6 +0,0 @@
/// Abstraction layer for all LLM providers.
/// New providers (Ollama, Mistral) are added here as further submodules.
pub mod lmstudio;
// Re-export from nazarick-core so that existing `api::llm::X` imports keep working
pub use nazarick_core::llm::{LlmProvider, LlmRequest, LlmResponse, Message};
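Context for the removed re-export: it is a plain facade, letting downstream code import the core LLM types through `api::llm` while they actually live in `nazarick_core`. A minimal sketch of a consumer (the helper function is hypothetical; only the import path comes from this file):

use api::llm::Message;

// Hypothetical helper: it compiles against the re-export, so moving the
// types into nazarick-core did not break this import.
fn first_role(messages: &[Message]) -> Option<&str> {
    messages.first().map(|m| m.role.as_str())
}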
+28 -45
@@ -1,3 +1,5 @@
// crates/api/src/llm/lmstudio.rs
use async_trait::async_trait;
use reqwest::Client;
use serde::{Deserialize, Serialize};
@@ -5,22 +7,13 @@ use nazarick_core::types::Result;
use nazarick_core::error::NazarickError;
use nazarick_core::llm::{LlmProvider, LlmRequest, LlmResponse, Message, SkillFormat};
/// LM Studio provider, for local development on the dev machine.
/// LM Studio emulates the OpenAI Chat Completions API, so we
/// use the OpenAI-compatible request/response format.
pub struct LmStudioProvider {
    /// HTTP client; reused for connection pooling
client: Client,
    /// Base URL of LM Studio; default: http://localhost:1234
base_url: String,
    /// Exact model identifier as displayed in LM Studio
model: String,
}
impl LmStudioProvider {
    /// Creates a new LM Studio provider.
    /// `base_url`: e.g. "http://localhost:1234"
    /// `model`: e.g. "qwen/qwen3.5-9b"
pub fn new(base_url: impl Into<String>, model: impl Into<String>) -> Self {
Self {
client: Client::new(),
@@ -29,8 +22,6 @@ impl LmStudioProvider {
}
}
    /// Removes Qwen3 thinking-mode tags from the response.
    /// Robust fallback in case the model ignores "thinking: false".
fn strip_thinking(response: &str) -> String {
let mut result = response.to_string();
while let Some(start) = result.find("<think>") {
@@ -46,76 +37,67 @@ impl LmStudioProvider {
}
}
-/// Internal message format; used both for requests (Serialize)
-/// and for responses (Deserialize).
-/// Qwen3 uses reasoning_content instead of content when thinking mode is active.
-#[derive(Serialize, Deserialize)]
-struct OpenAiMessage {
+/// Outgoing requests only; no reasoning_content
+#[derive(Serialize)]
+struct OpenAiRequestMessage {
    role: String,
    /// Normal answer; empty in Qwen3 thinking mode
    content: String,
}
+/// Incoming responses only; reasoning_content is optional
+#[derive(Deserialize)]
+struct OpenAiResponseMessage {
    #[serde(default)]
    content: String,
    /// Qwen3 thinking mode: holds the actual answer when content is empty
    #[serde(default)]
    reasoning_content: String,
}
/// Internal request format; matches the OpenAI Chat Completions API.
#[derive(Serialize)]
struct OpenAiRequest {
model: String,
-    messages: Vec<OpenAiMessage>,
+    messages: Vec<OpenAiRequestMessage>,
max_tokens: u32,
temperature: f32,
    /// Disable Qwen3 thinking mode; not honored by all LM Studio
    /// versions, so we additionally strip the tags from the response
-    thinking: bool,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    thinking: Option<bool>,
}
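// For reference (illustrative sketch, not part of this commit): with
// thinking = None, the struct above serializes to plain OpenAI Chat
// Completions JSON, since skip_serializing_if omits the field, e.g.
// {
//   "model": "qwen/qwen3.5-9b",
//   "messages": [{"role": "user", "content": "..."}],
//   "max_tokens": 512,
//   "temperature": 0.7
// }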
/// Response format of the OpenAI Chat Completions API.
#[derive(Deserialize)]
struct OpenAiResponse {
choices: Vec<OpenAiChoice>,
usage: Option<OpenAiUsage>,
}
/// A single answer candidate (LLMs can return several;
/// we always use the first).
#[derive(Deserialize)]
struct OpenAiChoice {
-    message: OpenAiMessage,
+    message: OpenAiResponseMessage,
}
/// Token usage as reported by the API.
#[derive(Deserialize)]
struct OpenAiUsage {
prompt_tokens: u64,
completion_tokens: u64,
}
/// Converts our internal messages into the OpenAI format.
/// reasoning_content is not sent; only role and content go out.
-fn to_openai_message(msg: &Message) -> OpenAiMessage {
-    OpenAiMessage {
+fn to_request_message(msg: &Message) -> OpenAiRequestMessage {
+    OpenAiRequestMessage {
        role: msg.role.clone(),
        content: msg.content.clone(),
-        reasoning_content: String::new(),
    }
}
#[async_trait]
impl LlmProvider for LmStudioProvider {
async fn complete(&self, request: LlmRequest) -> Result<LlmResponse> {
        // Convert the request into the OpenAI format
let openai_request = OpenAiRequest {
model: self.model.clone(),
-            messages: request.messages.iter().map(to_openai_message).collect(),
+            messages: request.messages.iter().map(to_request_message).collect(),
max_tokens: request.max_tokens,
temperature: request.temperature,
-            thinking: false,
+            thinking: None,
};
        // Send the HTTP POST to LM Studio
let response = self.client
.post(format!("{}/v1/chat/completions", self.base_url))
.json(&openai_request)
@@ -123,23 +105,27 @@ impl LlmProvider for LmStudioProvider {
.await
.map_err(|e| NazarickError::Api(e.to_string()))?;
-        // Check for HTTP errors (e.g. 404, 500)
-        let response = response
-            .error_for_status()
-            .map_err(|e| NazarickError::Api(e.to_string()))?;
+        // Log error details
+        if !response.status().is_success() {
+            let status = response.status();
+            let body = response.text().await.unwrap_or_default();
+            return Err(NazarickError::Api(format!(
+                "HTTP {} — Body: {}", status, body
+            )));
+        }
        // Read the raw JSON response; this consumes `response`
let raw_text = response
.text()
.await
.map_err(|e| NazarickError::Api(e.to_string()))?;
        // Parse the JSON response
let openai_response: OpenAiResponse = serde_json::from_str(&raw_text)
.map_err(|e| NazarickError::Api(e.to_string()))?;
        // Extract the content. Qwen3 thinking mode writes to reasoning_content
        // instead of content; we take whichever is filled, content has priority.
let raw_content = openai_response.choices
.into_iter()
.next()
@@ -152,10 +138,8 @@ impl LlmProvider for LmStudioProvider {
})
.unwrap_or_default();
        // Strip thinking tags as a fallback in case thinking:false is ignored
let content = Self::strip_thinking(&raw_content);
        // Extract token counts from usage (if present)
let (tokens_input, tokens_output) = openai_response.usage
.map(|u| (u.prompt_tokens, u.completion_tokens))
.unwrap_or((0, 0));
@@ -167,7 +151,6 @@ impl LlmProvider for LmStudioProvider {
"LmStudio"
}
    /// Local models served via LM Studio use the XML format for skill calls.
fn skill_format(&self) -> SkillFormat {
SkillFormat::Xml
}
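For orientation, a minimal sketch of driving this provider end to end. It assumes `LlmRequest` carries exactly the fields used in this diff (messages, max_tokens, temperature, tools), that `Message` has just role and content, that the module is reachable as `api::llm::lmstudio`, and that an async runtime is running:

use api::llm::lmstudio::LmStudioProvider;
use nazarick_core::llm::{LlmProvider, LlmRequest, Message};
use nazarick_core::types::Result;

async fn smoke_test() -> Result<()> {
    let provider = LmStudioProvider::new("http://localhost:1234", "qwen/qwen3.5-9b");
    let response = provider
        .complete(LlmRequest {
            messages: vec![Message { role: "user".into(), content: "Ping?".into() }],
            max_tokens: 128,
            temperature: 0.2,
            tools: None,
        })
        .await?;
    // Thinking tags are stripped, so content is the plain answer.
    println!("{} ({} in / {} out tokens)",
        response.content, response.tokens_input, response.tokens_output);
    Ok(())
}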
+2
@@ -0,0 +1,2 @@
// crates/api/src/llm/mod.rs
pub mod openai_compat;
+214
@@ -0,0 +1,214 @@
// crates/api/src/llm/openai_compat.rs
use async_trait::async_trait;
use reqwest::Client;
use serde::{Deserialize, Serialize};
use tracing::debug;
use nazarick_core::types::Result;
use nazarick_core::error::NazarickError;
use nazarick_core::llm::{LlmProvider, LlmRequest, LlmResponse, Message, SkillFormat, ToolCall};
pub struct OpenAiCompatProvider {
client: Client,
base_url: String,
model: String,
api_key: Option<String>,
skill_format: SkillFormat,
}
impl OpenAiCompatProvider {
pub fn new(
base_url: impl Into<String>,
model: impl Into<String>,
api_key: Option<String>,
skill_format: SkillFormat,
) -> Self {
Self {
client: Client::new(),
base_url: base_url.into(),
model: model.into(),
api_key,
skill_format,
}
}
fn strip_thinking(response: &str) -> String {
let mut result = response.to_string();
while let Some(start) = result.find("<think>") {
if let Some(end) = result.find("</think>") {
let tag = result[start..end + "</think>".len()].to_string();
result = result.replace(&tag, "");
} else {
result = result[..start].to_string();
break;
}
}
result.trim().to_string()
}
fn is_openrouter(&self) -> bool {
self.base_url.contains("openrouter.ai")
}
}
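// Illustrative sketch, not part of this commit: pinning down what
// strip_thinking above actually does with closed and unclosed tags.
// Caveat: the loop assumes tags are well-ordered; a stray </think>
// appearing before <think> would panic on the slice.
#[cfg(test)]
mod strip_thinking_sketch {
    use super::*;

    #[test]
    fn closed_tag_removed_unclosed_tail_dropped() {
        // A closed <think>…</think> span is cut out entirely.
        assert_eq!(
            OpenAiCompatProvider::strip_thinking("<think>inner monologue</think> Hi"),
            "Hi"
        );
        // An unclosed <think> drops everything from the tag onward.
        assert_eq!(
            OpenAiCompatProvider::strip_thinking("Hi <think>dangling"),
            "Hi"
        );
    }
}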
fn deserialize_null_as_empty<'de, D>(d: D) -> std::result::Result<String, D::Error>
where D: serde::Deserializer<'de> {
let opt = Option::<String>::deserialize(d)?;
Ok(opt.unwrap_or_default())
}
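// Quick probe (illustrative, not part of this commit): some OpenAI-compatible
// backends send "content": null, which a plain String field would reject;
// the helper above maps both null and a missing field to "".
#[cfg(test)]
mod null_content_sketch {
    use super::deserialize_null_as_empty;
    use serde::Deserialize;

    #[derive(Deserialize)]
    struct Probe {
        #[serde(default, deserialize_with = "deserialize_null_as_empty")]
        content: String,
    }

    #[test]
    fn null_and_missing_become_empty() {
        let p: Probe = serde_json::from_str(r#"{"content": null}"#).unwrap();
        assert_eq!(p.content, "");
        let p: Probe = serde_json::from_str("{}").unwrap();
        assert_eq!(p.content, "");
    }
}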
#[derive(Serialize)]
struct RequestMessage {
role: String,
content: String,
}
#[derive(Deserialize)]
struct ResponseMessage {
#[serde(default, deserialize_with = "deserialize_null_as_empty")]
content: String,
#[serde(default, deserialize_with = "deserialize_null_as_empty")]
reasoning_content: String,
#[serde(default)]
tool_calls: Option<Vec<ToolCall>>,
}
#[derive(Serialize)]
struct ChatRequest {
model: String,
messages: Vec<RequestMessage>,
max_tokens: u32,
temperature: f32,
#[serde(skip_serializing_if = "Option::is_none")]
tools: Option<Vec<serde_json::Value>>,
#[serde(skip_serializing_if = "Option::is_none")]
provider: Option<serde_json::Value>,
}
#[derive(Deserialize)]
struct ChatResponse {
choices: Vec<ChatChoice>,
usage: Option<ChatUsage>,
}
#[derive(Deserialize)]
struct ChatChoice {
message: ResponseMessage,
}
#[derive(Deserialize)]
struct ChatUsage {
prompt_tokens: u64,
completion_tokens: u64,
#[serde(default)]
cost: Option<f64>,
}
fn to_request_message(msg: &Message) -> RequestMessage {
RequestMessage {
role: msg.role.clone(),
content: msg.content.clone(),
}
}
#[async_trait]
impl LlmProvider for OpenAiCompatProvider {
async fn complete(&self, request: LlmRequest) -> Result<LlmResponse> {
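        // For requests to openrouter.ai, attach OpenRouter's provider routing
        // preferences: deny providers that collect data, ask for
        // zero-data-retention (zdr) endpoints, route only to providers that
        // support all request parameters, and allow fallbacks. Other backends
        // get no `provider` field at all.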
let provider = if self.is_openrouter() {
Some(serde_json::json!({
"data_collection": "deny",
"zdr": true,
"require_parameters": true,
"allow_fallbacks": true
}))
} else {
None
};
let chat_request = ChatRequest {
model: self.model.clone(),
messages: request.messages.iter().map(to_request_message).collect(),
max_tokens: request.max_tokens,
temperature: request.temperature,
tools: request.tools.clone(),
provider,
};
if let Some(ref t) = request.tools {
debug!("Tools im Request: {}", t.len());
}
let mut req = self.client
.post(format!("{}/chat/completions", self.base_url))
.json(&chat_request);
if let Some(key) = &self.api_key {
req = req.header("Authorization", format!("Bearer {}", key));
}
if self.is_openrouter() {
req = req.header("HTTP-Referer", "https://github.com/nazarick");
req = req.header("X-Title", "Nazarick");
}
let response = req
.send()
.await
.map_err(|e| NazarickError::Api(e.to_string()))?;
if !response.status().is_success() {
let status = response.status();
let body = response.text().await.unwrap_or_default();
return Err(NazarickError::Api(format!(
"HTTP {} — Body: {}", status, body
)));
}
let raw_text = response
.text()
.await
.map_err(|e| NazarickError::Api(e.to_string()))?;
let chat_response: ChatResponse = serde_json::from_str(&raw_text)
.map_err(|e| NazarickError::Api(e.to_string()))?;
let tool_calls = chat_response.choices
.first()
.and_then(|c| c.message.tool_calls.clone());
let raw_content = chat_response.choices
.into_iter()
.next()
.map(|c| {
if !c.message.content.is_empty() {
c.message.content
} else {
c.message.reasoning_content
}
})
.unwrap_or_default();
let content = Self::strip_thinking(&raw_content);
debug!("Response content: {}", content);
debug!("Tool calls: {:?}", tool_calls);
let cost = chat_response.usage
.as_ref()
.and_then(|u| u.cost);
let (tokens_input, tokens_output) = chat_response.usage
.map(|u| (u.prompt_tokens, u.completion_tokens))
.unwrap_or((0, 0));
Ok(LlmResponse { content, tokens_input, tokens_output, tool_calls, cost })
}
fn name(&self) -> &str {
"OpenAiCompat"
}
fn skill_format(&self) -> SkillFormat {
self.skill_format.clone()
}
}
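And the OpenAI-compatible provider wired up against OpenRouter; again a sketch, assuming the same `LlmRequest`/`Message` shapes as above, a module path of `api::llm::openai_compat`, and a model id that actually exists on OpenRouter:

use api::llm::openai_compat::OpenAiCompatProvider;
use nazarick_core::llm::{LlmProvider, LlmRequest, Message, SkillFormat};
use nazarick_core::types::Result;

async fn ask_openrouter() -> Result<()> {
    // The openrouter.ai host triggers is_openrouter(): routing preferences
    // plus the HTTP-Referer / X-Title headers are added automatically.
    let provider = OpenAiCompatProvider::new(
        "https://openrouter.ai/api/v1",
        "qwen/qwen3.5-9b", // hypothetical model id
        std::env::var("OPENROUTER_API_KEY").ok(),
        SkillFormat::Xml,
    );
    let response = provider
        .complete(LlmRequest {
            messages: vec![Message { role: "user".into(), content: "Hello!".into() }],
            max_tokens: 256,
            temperature: 0.7,
            tools: None,
        })
        .await?;
    println!("{} (cost: {:?})", response.content, response.cost);
    Ok(())
}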