added Tokens, openrouter, memory system
This commit is contained in:
@@ -1,6 +0,0 @@
|
||||
/// Abstraktionsschicht für alle LLM-Provider.
|
||||
/// Neue Provider (Ollama, Mistral) werden hier als weitere Submodule ergänzt.
|
||||
pub mod lmstudio;
|
||||
|
||||
// Re-export aus nazarick-core damit bestehende Importe `api::llm::X` weiter funktionieren
|
||||
pub use nazarick_core::llm::{LlmProvider, LlmRequest, LlmResponse, Message};
|
||||
@@ -1,3 +1,5 @@
|
||||
// crates/api/src/llm/lmstudio.rs
|
||||
|
||||
use async_trait::async_trait;
|
||||
use reqwest::Client;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -5,22 +7,13 @@ use nazarick_core::types::Result;
|
||||
use nazarick_core::error::NazarickError;
|
||||
use nazarick_core::llm::{LlmProvider, LlmRequest, LlmResponse, Message, SkillFormat};
|
||||
|
||||
/// LM Studio Provider — für lokale Entwicklung auf dem Entwicklungsrechner.
|
||||
/// LM Studio emuliert die OpenAI Chat Completions API, daher nutzen
|
||||
/// wir das OpenAI-kompatible Request/Response Format.
|
||||
pub struct LmStudioProvider {
|
||||
/// HTTP Client — wird wiederverwendet für Connection Pooling
|
||||
client: Client,
|
||||
/// Basis-URL von LM Studio, standard: http://localhost:1234
|
||||
base_url: String,
|
||||
/// Exakter Modell-Identifier wie er in LM Studio angezeigt wird
|
||||
model: String,
|
||||
}
|
||||
|
||||
impl LmStudioProvider {
|
||||
/// Erstellt einen neuen LM Studio Provider.
|
||||
/// `base_url` — z.B. "http://localhost:1234"
|
||||
/// `model` — z.B. "qwen/qwen3.5-9b"
|
||||
pub fn new(base_url: impl Into<String>, model: impl Into<String>) -> Self {
|
||||
Self {
|
||||
client: Client::new(),
|
||||
@@ -29,8 +22,6 @@ impl LmStudioProvider {
|
||||
}
|
||||
}
|
||||
|
||||
/// Entfernt Qwen3 Thinking Mode Tags aus der Antwort.
|
||||
/// Robuster Fallback falls "thinking: false" vom Modell ignoriert wird.
|
||||
fn strip_thinking(response: &str) -> String {
|
||||
let mut result = response.to_string();
|
||||
while let Some(start) = result.find("<think>") {
|
||||
@@ -46,76 +37,67 @@ impl LmStudioProvider {
|
||||
}
|
||||
}
|
||||
|
||||
/// Internes Message-Format — wird sowohl für Request (Serialize)
|
||||
/// als auch für Response (Deserialize) verwendet.
|
||||
/// Qwen3 nutzt reasoning_content statt content wenn Thinking Mode aktiv.
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct OpenAiMessage {
|
||||
/// Nur für ausgehende Requests — kein reasoning_content
|
||||
#[derive(Serialize)]
|
||||
struct OpenAiRequestMessage {
|
||||
role: String,
|
||||
/// Normale Antwort — bei Qwen3 Thinking Mode leer
|
||||
content: String,
|
||||
}
|
||||
|
||||
/// Nur für eingehende Responses — reasoning_content optional
|
||||
#[derive(Deserialize)]
|
||||
struct OpenAiResponseMessage {
|
||||
#[serde(default)]
|
||||
content: String,
|
||||
/// Qwen3 Thinking Mode — enthält die eigentliche Antwort wenn content leer
|
||||
#[serde(default)]
|
||||
reasoning_content: String,
|
||||
}
|
||||
|
||||
/// Internes Request-Format — entspricht der OpenAI Chat Completions API.
|
||||
#[derive(Serialize)]
|
||||
struct OpenAiRequest {
|
||||
model: String,
|
||||
messages: Vec<OpenAiMessage>,
|
||||
messages: Vec<OpenAiRequestMessage>,
|
||||
max_tokens: u32,
|
||||
temperature: f32,
|
||||
/// Qwen3 Thinking Mode deaktivieren — funktioniert nicht bei allen
|
||||
/// LM Studio Versionen, daher strippen wir zusätzlich im Response
|
||||
thinking: bool,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
thinking: Option<bool>,
|
||||
}
|
||||
|
||||
/// Response-Format der OpenAI Chat Completions API.
|
||||
#[derive(Deserialize)]
|
||||
struct OpenAiResponse {
|
||||
choices: Vec<OpenAiChoice>,
|
||||
usage: Option<OpenAiUsage>,
|
||||
}
|
||||
|
||||
/// Ein einzelner Antwort-Kandidat (LLMs können mehrere zurückgeben,
|
||||
/// wir nutzen immer den ersten).
|
||||
#[derive(Deserialize)]
|
||||
struct OpenAiChoice {
|
||||
message: OpenAiMessage,
|
||||
message: OpenAiResponseMessage,
|
||||
}
|
||||
|
||||
/// Token-Verbrauch wie von der API zurückgemeldet.
|
||||
#[derive(Deserialize)]
|
||||
struct OpenAiUsage {
|
||||
prompt_tokens: u64,
|
||||
completion_tokens: u64,
|
||||
}
|
||||
|
||||
/// Konvertiert unsere internen Messages in das OpenAI Format.
|
||||
/// reasoning_content wird beim Senden nicht mitgeschickt — nur role und content.
|
||||
fn to_openai_message(msg: &Message) -> OpenAiMessage {
|
||||
OpenAiMessage {
|
||||
fn to_request_message(msg: &Message) -> OpenAiRequestMessage {
|
||||
OpenAiRequestMessage {
|
||||
role: msg.role.clone(),
|
||||
content: msg.content.clone(),
|
||||
reasoning_content: String::new(),
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl LlmProvider for LmStudioProvider {
|
||||
async fn complete(&self, request: LlmRequest) -> Result<LlmResponse> {
|
||||
// Request in OpenAI Format umwandeln
|
||||
let openai_request = OpenAiRequest {
|
||||
model: self.model.clone(),
|
||||
messages: request.messages.iter().map(to_openai_message).collect(),
|
||||
messages: request.messages.iter().map(to_request_message).collect(),
|
||||
max_tokens: request.max_tokens,
|
||||
temperature: request.temperature,
|
||||
thinking: false,
|
||||
thinking: None,
|
||||
};
|
||||
|
||||
// HTTP POST an LM Studio senden
|
||||
let response = self.client
|
||||
.post(format!("{}/v1/chat/completions", self.base_url))
|
||||
.json(&openai_request)
|
||||
@@ -123,23 +105,27 @@ impl LlmProvider for LmStudioProvider {
|
||||
.await
|
||||
.map_err(|e| NazarickError::Api(e.to_string()))?;
|
||||
|
||||
// HTTP Fehler prüfen (z.B. 404, 500)
|
||||
// Fehler-Details loggen
|
||||
if !response.status().is_success() {
|
||||
let status = response.status();
|
||||
let body = response.text().await.unwrap_or_default();
|
||||
return Err(NazarickError::Api(format!(
|
||||
"HTTP {} — Body: {}", status, body
|
||||
)));
|
||||
}
|
||||
|
||||
let response = response
|
||||
.error_for_status()
|
||||
.map_err(|e| NazarickError::Api(e.to_string()))?;
|
||||
|
||||
// Rohe JSON-Antwort lesen — response wird dabei konsumiert
|
||||
let raw_text = response
|
||||
.text()
|
||||
.await
|
||||
.map_err(|e| NazarickError::Api(e.to_string()))?;
|
||||
|
||||
// JSON Response parsen
|
||||
let openai_response: OpenAiResponse = serde_json::from_str(&raw_text)
|
||||
.map_err(|e| NazarickError::Api(e.to_string()))?;
|
||||
|
||||
// Content extrahieren — Qwen3 Thinking Mode schreibt in reasoning_content
|
||||
// statt content. Wir nehmen was befüllt ist, content hat Priorität.
|
||||
let raw_content = openai_response.choices
|
||||
.into_iter()
|
||||
.next()
|
||||
@@ -152,10 +138,8 @@ impl LlmProvider for LmStudioProvider {
|
||||
})
|
||||
.unwrap_or_default();
|
||||
|
||||
// Thinking Tags entfernen — Fallback falls thinking:false ignoriert wird
|
||||
let content = Self::strip_thinking(&raw_content);
|
||||
|
||||
// Token-Zahlen aus Usage extrahieren (falls vorhanden)
|
||||
let (tokens_input, tokens_output) = openai_response.usage
|
||||
.map(|u| (u.prompt_tokens, u.completion_tokens))
|
||||
.unwrap_or((0, 0));
|
||||
@@ -167,7 +151,6 @@ impl LlmProvider for LmStudioProvider {
|
||||
"LmStudio"
|
||||
}
|
||||
|
||||
/// Lokale Modelle via LM Studio nutzen XML-Format für Skill-Calls.
|
||||
fn skill_format(&self) -> SkillFormat {
|
||||
SkillFormat::Xml
|
||||
}
|
||||
|
||||
@@ -0,0 +1,2 @@
|
||||
// crates/api/src/llm/mod.rs
|
||||
pub mod openai_compat;
|
||||
@@ -0,0 +1,214 @@
|
||||
// crates/api/src/llm/openai_compat.rs
|
||||
|
||||
use async_trait::async_trait;
|
||||
use reqwest::Client;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::debug;
|
||||
use nazarick_core::types::Result;
|
||||
use nazarick_core::error::NazarickError;
|
||||
use nazarick_core::llm::{LlmProvider, LlmRequest, LlmResponse, Message, SkillFormat, ToolCall};
|
||||
|
||||
pub struct OpenAiCompatProvider {
|
||||
client: Client,
|
||||
base_url: String,
|
||||
model: String,
|
||||
api_key: Option<String>,
|
||||
skill_format: SkillFormat,
|
||||
}
|
||||
|
||||
impl OpenAiCompatProvider {
|
||||
pub fn new(
|
||||
base_url: impl Into<String>,
|
||||
model: impl Into<String>,
|
||||
api_key: Option<String>,
|
||||
skill_format: SkillFormat,
|
||||
) -> Self {
|
||||
Self {
|
||||
client: Client::new(),
|
||||
base_url: base_url.into(),
|
||||
model: model.into(),
|
||||
api_key,
|
||||
skill_format,
|
||||
}
|
||||
}
|
||||
|
||||
/// Removes `<think>…</think>` reasoning blocks from a model response.
///
/// * Every complete `<think>…</think>` span is cut out.
/// * An opening `<think>` without a matching closing tag discards everything
///   from that opening tag to the end of the text.
/// * A stray `</think>` that is not preceded by `<think>` is left untouched.
///
/// The returned text is trimmed of leading and trailing whitespace.
fn strip_thinking(response: &str) -> String {
    const OPEN: &str = "<think>";
    const CLOSE: &str = "</think>";

    let mut result = response.to_string();
    while let Some(start) = result.find(OPEN) {
        // Look for the closing tag only *after* the opening tag. Searching from
        // the beginning of the string could pick up an earlier stray `</think>`
        // and produce an inverted slice range, which panics.
        match result[start..].find(CLOSE) {
            Some(offset) => {
                let end = start + offset + CLOSE.len();
                result.replace_range(start..end, "");
            }
            None => {
                // Unterminated block: drop the tag and everything after it.
                result.truncate(start);
                break;
            }
        }
    }
    result.trim().to_string()
}
|
||||
|
||||
fn is_openrouter(&self) -> bool {
|
||||
self.base_url.contains("openrouter.ai")
|
||||
}
|
||||
}
|
||||
|
||||
fn deserialize_null_as_empty<'de, D>(d: D) -> std::result::Result<String, D::Error>
|
||||
where D: serde::Deserializer<'de> {
|
||||
let opt = Option::<String>::deserialize(d)?;
|
||||
Ok(opt.unwrap_or_default())
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct RequestMessage {
|
||||
role: String,
|
||||
content: String,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct ResponseMessage {
|
||||
#[serde(default, deserialize_with = "deserialize_null_as_empty")]
|
||||
content: String,
|
||||
#[serde(default, deserialize_with = "deserialize_null_as_empty")]
|
||||
reasoning_content: String,
|
||||
#[serde(default)]
|
||||
tool_calls: Option<Vec<ToolCall>>,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct ChatRequest {
|
||||
model: String,
|
||||
messages: Vec<RequestMessage>,
|
||||
max_tokens: u32,
|
||||
temperature: f32,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
tools: Option<Vec<serde_json::Value>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
provider: Option<serde_json::Value>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct ChatResponse {
|
||||
choices: Vec<ChatChoice>,
|
||||
usage: Option<ChatUsage>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct ChatChoice {
|
||||
message: ResponseMessage,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct ChatUsage {
|
||||
prompt_tokens: u64,
|
||||
completion_tokens: u64,
|
||||
#[serde(default)]
|
||||
cost: Option<f64>,
|
||||
}
|
||||
|
||||
fn to_request_message(msg: &Message) -> RequestMessage {
|
||||
RequestMessage {
|
||||
role: msg.role.clone(),
|
||||
content: msg.content.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl LlmProvider for OpenAiCompatProvider {
|
||||
async fn complete(&self, request: LlmRequest) -> Result<LlmResponse> {
|
||||
let provider = if self.is_openrouter() {
|
||||
Some(serde_json::json!({
|
||||
"data_collection": "deny",
|
||||
"zdr": true,
|
||||
"require_parameters": true,
|
||||
"allow_fallbacks": true
|
||||
}))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let chat_request = ChatRequest {
|
||||
model: self.model.clone(),
|
||||
messages: request.messages.iter().map(to_request_message).collect(),
|
||||
max_tokens: request.max_tokens,
|
||||
temperature: request.temperature,
|
||||
tools: request.tools.clone(),
|
||||
provider,
|
||||
};
|
||||
|
||||
if let Some(ref t) = request.tools {
|
||||
debug!("Tools im Request: {}", t.len());
|
||||
}
|
||||
|
||||
let mut req = self.client
|
||||
.post(format!("{}/chat/completions", self.base_url))
|
||||
.json(&chat_request);
|
||||
|
||||
if let Some(key) = &self.api_key {
|
||||
req = req.header("Authorization", format!("Bearer {}", key));
|
||||
}
|
||||
|
||||
if self.is_openrouter() {
|
||||
req = req.header("HTTP-Referer", "https://github.com/nazarick");
|
||||
req = req.header("X-Title", "Nazarick");
|
||||
}
|
||||
|
||||
let response = req
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| NazarickError::Api(e.to_string()))?;
|
||||
|
||||
if !response.status().is_success() {
|
||||
let status = response.status();
|
||||
let body = response.text().await.unwrap_or_default();
|
||||
return Err(NazarickError::Api(format!(
|
||||
"HTTP {} — Body: {}", status, body
|
||||
)));
|
||||
}
|
||||
|
||||
let raw_text = response
|
||||
.text()
|
||||
.await
|
||||
.map_err(|e| NazarickError::Api(e.to_string()))?;
|
||||
|
||||
let chat_response: ChatResponse = serde_json::from_str(&raw_text)
|
||||
.map_err(|e| NazarickError::Api(e.to_string()))?;
|
||||
|
||||
let tool_calls = chat_response.choices
|
||||
.first()
|
||||
.and_then(|c| c.message.tool_calls.clone());
|
||||
|
||||
let raw_content = chat_response.choices
|
||||
.into_iter()
|
||||
.next()
|
||||
.map(|c| {
|
||||
if !c.message.content.is_empty() {
|
||||
c.message.content
|
||||
} else {
|
||||
c.message.reasoning_content
|
||||
}
|
||||
})
|
||||
.unwrap_or_default();
|
||||
|
||||
let content = Self::strip_thinking(&raw_content);
|
||||
|
||||
debug!("Response content: {}", content);
|
||||
debug!("Tool calls: {:?}", tool_calls);
|
||||
|
||||
let cost = chat_response.usage
|
||||
.as_ref()
|
||||
.and_then(|u| u.cost);
|
||||
|
||||
let (tokens_input, tokens_output) = chat_response.usage
|
||||
.map(|u| (u.prompt_tokens, u.completion_tokens))
|
||||
.unwrap_or((0, 0));
|
||||
|
||||
Ok(LlmResponse { content, tokens_input, tokens_output, tool_calls, cost })
|
||||
}
|
||||
|
||||
fn name(&self) -> &str {
|
||||
"OpenAiCompat"
|
||||
}
|
||||
|
||||
/// Returns a copy of the skill-call format this provider was constructed with.
fn skill_format(&self) -> SkillFormat {
    let configured = &self.skill_format;
    configured.clone()
}
|
||||
}
|
||||
Reference in New Issue
Block a user