diff --git a/README.md b/README.md
index b6d7da7..2df88c1 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,185 @@
-# ML
+# ML Repo — Architecture and External RAG Server Design (for Ollama/Open WebUI)
 
-My openWebUI/searxng configs, plugins, RAG server, as well as a custom program that runs the AI’s code in isolated Docker containers
\ No newline at end of file
+My openWebUI/searxng configs, plugins, RAG server, as well as a custom program that runs the AI’s code in isolated Docker containers
+
+*Last updated: 2025-09-10*
+
+---
+
+## Summary :3
+
+This repository wires together a local AI stack built around **Open WebUI**, **Ollama**, **SearxNG**, and two custom utilities: a **code runner** (executes model-generated code inside sandboxed containers) and a **headless research browser UI**. The current compose setup already gives you working RAG (retrieval-augmented generation) **inside Open WebUI** without needing a separate RAG service.
+
+---
+
+## Repo map and how each piece fits
+
+```sh
+.
+├─ docker-compose.yml
+├─ searxng.yml                   # searxng settings; defaults, json+html enabled; not a public instance
+├─ cloudflared-tunnel-config.yml # cloudflare tunnel routing to ollama, openwebui, and tools
+├─ README.md
+├─ LICENSE                       # apache-2.0
+│
+├─ rag-server/
+│  ├─ Dockerfile                 # bun image that runs the RAG server
+│  └─ index.ts                   # the RAG server (collections, upsert, query, chat over Ollama)
+│
+├─ browser/
+│  └─ Dockerfile                 # builds browser-use/web-ui (playwright chromium) on :7788
+│
+└─ coderunner/
+   ├─ Dockerfile                 # bun-based service that exposes an OpenAPI tool for sandboxed code exec
+   ├─ index.ts                   # the server; integrates with Open WebUI as a tool via /openapi.json
+   └─ package.json               # @types/node only (dev) to feed the OCD
+```
+
+### Open WebUI (in `docker-compose.yml`)
+
+* purpose: chat UI + orchestration layer; **includes a built-in knowledge base + RAG** with chunking, embedding, search, and prompt templating.
+* notable: backed by Postgres in this compose. exposes `4000:8080`.
+* storage: a docker volume `open-webui:` holds app data; Postgres uses `pgdata:`.
+
+### Postgres (in `docker-compose.yml`)
+
+* purpose: persistence for Open WebUI features (users, knowledge, etc.). health-checked with `pg_isready`.
+
+### SearxNG (in `docker-compose.yml` + `searxng.yml`)
+
+* purpose: metasearch engine used by Open WebUI tools/agents for live web lookups.
+* config highlights: `use_default_settings: true`, `public_instance: false`, `limiter: false`; formats: `html` and `json`.
+
+### Coderunner service (`coderunner/`)
+
+* **what it is:** a small HTTP server (Bun runtime) that executes pure source code in short-lived, sandboxed containers.
+* **why it exists:** lets Open WebUI tools run code safely with tight resource limits (no network, read-only fs, cgroup limits, `--cap-drop=ALL`, `no-new-privileges`).
+* **integration contract:** exposes an **OpenAPI schema at `/openapi.json`** and a single POST `/execute` endpoint. Open WebUI can import this as a **tool server**; see the example request below.
+* **security posture:** pulls allow-listed base images (gcc, python, node, bun, etc.), mounts only a tmpfs workdir, times out jobs ≈25s, and runs with a non-root uid/gid. the container has access to the host’s docker socket *only* to run the sandbox containers.
+
+### Browser-use web-ui (`browser/`)
+
+* purpose: “autonomous” research browser UI (chromium via playwright), reachable on `:7788`.
+* built from upstream `browser-use/web-ui` repo, with python deps and browsers installed in the image.
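+
+To make the coderunner’s tool contract concrete, here is a minimal client sketch. the exact `/execute` request/response fields aren’t spelled out in this README, so `language`, `code`, and the shape of the result are assumptions — treat `/openapi.json` (and `coderunner/index.ts`) as the authoritative schema:
+
+```ts
+// hypothetical client for the coderunner tool server — field names are assumptions,
+// the real contract is whatever /openapi.json advertises.
+const CODERUNNER_BASE = "http://localhost:8000"; // assumption: adjust to your deployment
+
+async function runSnippet(): Promise<void> {
+  // fetch the schema Open WebUI would import when you add this as a tool server
+  const schema = await fetch(`${CODERUNNER_BASE}/openapi.json`).then(r => r.json());
+  console.log("tool paths:", Object.keys(schema.paths ?? {}));
+
+  // submit a snippet for sandboxed execution (field names hypothetical)
+  const res = await fetch(`${CODERUNNER_BASE}/execute`, {
+    method: "POST",
+    headers: { "content-type": "application/json" },
+    body: JSON.stringify({ language: "python", code: "print(1 + 1)" }),
+  });
+  console.log(await res.json()); // e.g. captured output / exit status
+}
+
+runSnippet();
+```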
+ +### Cloudflared tunnel (`cloudflared-tunnel-config.yml`) + +* maps hostnames (like `mlep.domain.com` for Ollama, `owebui.domain.com` for Open WebUI, and a `tools` host) to the internal services. useful for private, authenticated access without public inbound ports. + +--- + +## Why you currently **don’t** need an external RAG server + +Open WebUI ships with first-class **knowledge / RAG** support: add files/URLs, it chunks + embeds, indexes, retrieves, and automatically **prefixes retrieved context** to the model prompt using a RAG template. for lightweight to mid-sized corpora and single-user/small-team usage, that’s often all you need. + +**Stay with built-in RAG if most of these are true:** + +* total corpus is ≤ \~100k chunks and grows slowly. +* single user or small team (no multi-tenant isolation needed). +* no special retrieval logic (hybrid lexical+semantic, rerankers, metadata filters) beyond what Open WebUI provides. +* tolerance for “UI-managed” knowledge; you don’t need programmatic ingestion pipelines or job queues. + +## When an external RAG server makes sense + +Adopt a decoupled RAG service when you need one or more of: + +* **bigger data / throughput**: millions of chunks, higher QPS, horizontal scaling. +* **advanced retrieval**: custom chunkers, hybrid search (bm25 + vector), **reranking**, time-decay, per-tenant filters, embeddings A/B, or multi-modal (image/audio) retrieval. +* **programmatic ingestion**: CI-driven pipelines from git/docs/confluence/S3; delta updates; background jobs. +* **governance / isolation**: strict multi-tenant separation, PII retention controls, audit trails. +* **interoperability**: a clean HTTP API and OpenAPI so other apps (beyond Open WebUI) can reuse your index. + +--- + +## External RAG Server — Design and Reference Implementation + +This is a small, dependency-light service designed to run with **Bun** and integrate with both **Ollama** and **Open WebUI**. + +### Goals + +* minimal moving parts; runs fine on a single host. +* uses Ollama for **embeddings** and **chat**. +* supports **collections**, **upserts**, **queries**, and an opinionated `/chat` that does retrieve-then-generate. +* ships an **OpenAPI** so Open WebUI can import it as a tool server. +* default in-memory store (persisted to JSON) for simplicity; optional adapters for vector DBs later. + +### API surface + +* `GET /openapi.json` – schema for tool integration. +* `POST /collections` – create a logical collection `{ name }`. +* `GET /collections` – list collections. +* `POST /upsert` – `{ collection, items:[{ id?, text, metadata? }] }`; chunks+embeds text and stores vectors. +* `POST /query` – `{ collection, query, topK?=5, where? }` --> nearest chunks with scores. +* `POST /chat` – `{ collection, query, topK?=5, model?, embedModel? }` --> runs RAG and calls Ollama chat, returns the answer + citations. + +### Storage Strategy + +* **default:** in-memory + JSON file on disk (`./data/rag.json`). good for dev/small usage. +* **plug-in adapters:** swap in Qdrant, SQLite-Vec, pgvector, Weaviate, etc., without changing the HTTP API. 
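+
+To make the “plug-in adapters” idea concrete, here is one possible shape for the storage boundary. this is a sketch only — the reference implementation below does not define a `VectorStore` interface yet, and the names are illustrative:
+
+```ts
+// sketch of a storage adapter boundary: the HTTP handlers would talk only to this
+// interface, so the JSON-file store and e.g. a Qdrant- or pgvector-backed store stay interchangeable.
+interface StoredChunk {
+  id: string;
+  text: string;
+  metadata?: Record<string, unknown>;
+  vector: number[];
+}
+
+interface VectorStore {
+  createCollection(name: string): Promise<void>;
+  listCollections(): Promise<string[]>;
+  upsert(collection: string, chunks: StoredChunk[]): Promise<number>; // returns count indexed
+  query(collection: string, vector: number[], topK: number): Promise<{ chunk: StoredChunk; score: number }[]>;
+}
+```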
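+
+Putting the API surface above together, a minimal end-to-end smoke test could look like the following. the base URL is an assumption (the service defaults to port 8788); point it at your compose service or cloudflared hostname:
+
+```ts
+// end-to-end smoke test against the RAG server's documented endpoints
+const RAG_BASE = "http://localhost:8788"; // assumption: adjust to your deployment
+
+async function post<T>(path: string, body: unknown): Promise<T> {
+  const res = await fetch(`${RAG_BASE}${path}`, {
+    method: "POST",
+    headers: { "content-type": "application/json" },
+    body: JSON.stringify(body),
+  });
+  if (!res.ok) throw new Error(`${path} failed: ${res.status}`);
+  return res.json() as Promise<T>;
+}
+
+// 1. create a collection
+await post("/collections", { name: "notes" });
+
+// 2. index a document (chunking + embedding happen server-side)
+await post("/upsert", {
+  collection: "notes",
+  items: [{ text: "Open WebUI is exposed on host port 4000 in this compose.", metadata: { source: "readme" } }],
+});
+
+// 3. retrieve nearest chunks, then ask for a grounded answer with citations
+const { matches } = await post<{ matches: { text: string; score: number }[] }>("/query", { collection: "notes", query: "which host port serves Open WebUI?", topK: 3 });
+const { answer } = await post<{ answer: string }>("/chat", { collection: "notes", query: "which host port serves Open WebUI?" });
+console.log(matches[0]?.score, answer);
+```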
+
+---
+
+### Add to `docker-compose.yml` (under `services:`)
+
+```yaml
+  rag:
+    build:
+      context: ./rag-server
+      dockerfile: Dockerfile
+    environment:
+      OLLAMA_BASE: "http://mlep.domain.com:11434"
+      OLLAMA_CHAT_MODEL: "llama3.1"
+      OLLAMA_EMBED_MODEL: "nomic-embed-text"
+    volumes:
+      - rag_data:/app/data
+    networks:
+      - internal
+    restart: unless-stopped
+
+volumes:
+  rag_data:
+```
+
+> if you already expose services via cloudflared, add another hostname mapping to the `rag` container (`- hostname: rag.domain.com -> service: http://rag:8788`).
+
+---
+
+## Wiring the RAG server into Open WebUI and Ollama
+
+### 1. Pull models
+
+* `ollama pull nomic-embed-text` (embeddings)
+* `ollama pull llama3.1` (chat)
+
+### 2. Expose the OpenAPI schema to Open WebUI as a **tool server**
+
+* in Open WebUI --> **settings --> tools** --> **add tool server**
+* paste the RAG server’s url (e.g. its cloudflared hostname)
+* you’ll now see tool functions like `listCollections`, `createCollection`, `upsert`, `query`, `chat` available to the assistant
+
+### 3. Usage pattern inside a chat
+
+* to build a knowledge base, call the `createCollection` and `upsert` tools with your documents
+* to answer, call `chat`, which performs retrieve-then-generate against your chosen collection
+
+---
+
+## FAQ — Built-in vs. External RAG
+
+**Q: will Open WebUI’s built-in RAG conflict with this server?**
+no — you can use either, or both. Open WebUI’s knowledge base is great for ad-hoc use. this service is for programmatic/control-plane needs or when you outgrow the UI’s storage/retrieval.
+
+**Q: how do I enforce tenant isolation?**
+use one collection per tenant and never mix them. for stronger guarantees, run separate RAG instances or choose Qdrant with per-collection access control.
+
+**Q: can I use my own chunker/reranker?**
+yes. place them ahead of `/upsert` and `/query` respectively, or add endpoints like `/rerank` and `/embed` to experiment.
+
+**Q: can this call OpenAI-compatible endpoints instead of native Ollama?**
+yes — Ollama exposes an experimental OpenAI-compatible API, so you can add a thin client if you already point tools at `/v1/chat/completions`.
+
+---
+
+## License
+
+This write-up and reference code are provided under the same **Apache-2.0** terms as the repository.
diff --git a/rag-server/Dockerfile b/rag-server/Dockerfile
new file mode 100644
index 0000000..4aaf98c
--- /dev/null
+++ b/rag-server/Dockerfile
@@ -0,0 +1,12 @@
+# syntax=docker/dockerfile:1
+FROM oven/bun:1.2.2-alpine
+
+WORKDIR /app
+
+COPY index.ts ./index.ts
+
+ENV PORT=8788
+
+EXPOSE 8788
+
+CMD ["bun","run","index.ts"]
\ No newline at end of file
diff --git a/rag-server/index.ts b/rag-server/index.ts
new file mode 100644
index 0000000..64ce2be
--- /dev/null
+++ b/rag-server/index.ts
@@ -0,0 +1,289 @@
+import { serve } from "bun";
+import fs from "node:fs";
+import path from "node:path";
+
+// types
+interface Chunk {
+  id: string;
+  text: string;
+  metadata?: Record<string, unknown>;
+  vector: number[];
+}
+
+interface Collection {
+  name: string;
+  chunks: Chunk[];
+}
+
+interface OllamaChatMessage {
+  role: "system" | "user" | "assistant";
+  content: string;
+}
+
+interface OllamaChatRequest {
+  model?: string;
+  messages: OllamaChatMessage[];
+  stream?: boolean;
+}
+
+interface OllamaChatResponse {
+  message?: OllamaChatMessage;
+  [k: string]: unknown;
+}
+
+interface UpsertInputItem {
+  text: string;
+  metadata?: Record<string, unknown>;
+}
+
+interface OpenAPIObject {
+  openapi: string;
+  info: { title: string; version: string };
+  paths: Record<string, unknown>;
+}
+
+// env
+const PORT: number = Number(process.env.PORT || 8788),
+  HOST: string = process.env.HOST || "0.0.0.0",
+  OLLAMA_BASE: string = process.env.OLLAMA_BASE || "http://localhost:11434",
+  OLLAMA_CHAT_MODEL: string = process.env.OLLAMA_CHAT_MODEL || "llama3.1",
+  OLLAMA_EMBED_MODEL: string = process.env.OLLAMA_EMBED_MODEL || "nomic-embed-text",
+  DATA_DIR: string = process.env.DATA_DIR || path.resolve("./data"),
+  SNAPSHOT: string = path.join(DATA_DIR, "rag.json");
+
+// in-memory db
+const db: Map<string, Collection> = new Map();
+
+// util: smol json persistence
+function ensureDirs(): void {
+  if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
+}
+
+// you can probably guess
+function loadSnapshot(): void {
+  try {
+    ensureDirs();
+    if (fs.existsSync(SNAPSHOT)) {
+      const raw = fs.readFileSync(SNAPSHOT, "utf8");
+      const obj = JSON.parse(raw || "{}") as Record<string, Collection>;
+      for (const [name, value] of Object.entries(obj)) db.set(name, value);
+    }
+  } catch (e) {
+    console.warn("failed to load snapshot:", e);
+  }
+}
+
+// you can probably guess 2
+function saveSnapshot(): void {
+  try {
+    ensureDirs();
+    const obj = Object.fromEntries(db.entries());
+    fs.writeFileSync(SNAPSHOT, JSON.stringify(obj, null, 2));
+  } catch (e) {
+    console.warn("failed to save snapshot:", e);
+  }
+}
+
+loadSnapshot();
+
+// basic text splitter (recursive by punctuation, then by length)
+function chunkText(text: string, maxLen = 800): string[] {
+  const parts = text
+    .split(/\n{2,}/g)
+    .flatMap(p => p.split(/(?<=[.!?])\s+/g))
+    .flatMap(s => s.length > maxLen ? s.match(new RegExp(`.{1,${maxLen}}`, "g")) || [] : [s])
+    .map(s => s.trim())
+    .filter(Boolean);
+  return parts;
+}
+
+// cosine similarity
+function dot(a: number[], b: number[]): number { let s = 0; for (let i = 0; i < a.length; i++) s += (a[i] || 0) * (b[i] || 0); return s; }
+function norm(a: number[]): number { return Math.sqrt(dot(a, a)); }
+function cosineSim(a: number[], b: number[]): number { const d = dot(a, b), n = norm(a) * norm(b) || 1; return d / n; }
+
+// call ollama embeddings
+async function embedAll(texts: string[]): Promise<number[][]> {
+  const primary = await fetch(`${OLLAMA_BASE}/api/embed`, {
+    method: "POST",
+    headers: { "content-type": "application/json" },
+    body: JSON.stringify({ model: OLLAMA_EMBED_MODEL, input: texts })
+  });
+
+  if (primary.ok) {
+    const j: { embeddings: number[][] } = await primary.json();
+    return j.embeddings;
+  }
+
+  const results: number[][] = [];
+  for (const t of texts) {
+    const r = await fetch(`${OLLAMA_BASE}/api/embeddings`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({ model: OLLAMA_EMBED_MODEL, prompt: t })
+    });
+
+    if (!r.ok) throw new Error(`embed failed: ${r.status}`);
+
+    const j: { embedding: number[] } = await r.json();
+    results.push(j.embedding);
+  }
+  return results;
+}
+
+// call ollama chat/generate with retrieved context
+async function ollamaChat(req: OllamaChatRequest): Promise<OllamaChatResponse> {
+  const res = await fetch(`${OLLAMA_BASE}/api/chat`, {
+    method: "POST",
+    headers: { "content-type": "application/json" },
+    body: JSON.stringify({ model: req.model || OLLAMA_CHAT_MODEL, messages: req.messages, stream: req.stream })
+  });
+
+  if (!res.ok) throw new Error(`ollama chat failed: ${res.status}`);
+  const j: OllamaChatResponse = await res.json();
+
+  return j;
+}
+
+// openapi for open webui tool integration
+const OPENAPI: OpenAPIObject = {
+  openapi: "3.1.0",
+  info: { title: "RAG Server (Ollama)", version: "1.0.0" },
+  paths: {
+    "/collections": {
+      get: { operationId: "listCollections" },
+      post: { operationId: "createCollection" }
+    },
+    "/upsert": { post: { operationId: "upsert" } },
+    "/query": { post: { operationId: "query" } },
+    "/chat": { post: { operationId: "chat" } }
+  }
+};
+
+// tiny router
+async function json<T>(req: Request): Promise<T> { try { return await req.json() as T; } catch { return {} as T; } }
+function sendJson(_res: unknown, status: number, obj: unknown): Response {
+  return new Response(JSON.stringify(obj), { status, headers: { "content-type": "application/json; charset=utf-8" } });
+}
+
+async function handleCollections(req: Request): Promise<Response> {
+  if (req.method === "GET") {
+    return sendJson(null, 200, { collections: Array.from(db.keys()) });
+  }
+
+  if (req.method === "POST") {
+    const body = await json<{ name?: string }>(req),
+      name = String(body?.name || "").trim();
+
+    if (!name) return sendJson(null, 400, { error: "name required" });
+    if (!db.has(name)) db.set(name, { name, chunks: [] });
+
+    saveSnapshot();
+    return sendJson(null, 200, { ok: true });
+  }
+
+  return new Response("not found", { status: 404 });
+}
+
+async function handleUpsert(req: Request): Promise<Response> {
+  const body = await json<{ collection?: string; items?: UpsertInputItem[] }>(req),
+    collection = String(body?.collection || "").trim(),
+    items: UpsertInputItem[] = Array.isArray(body?.items) ? body.items : [];
+
+  if (!collection) return sendJson(null, 400, { error: "collection required" });
+  if (!db.has(collection)) db.set(collection, { name: collection, chunks: [] });
+
+  const col = db.get(collection)!,
+    chunksToIndex: { text: string; metadata?: Record<string, unknown>; _id: string }[] = [];
+
+  for (const it of items) {
+    const parts = chunkText(String(it.text || ""));
+    for (const p of parts) chunksToIndex.push({ text: p, metadata: it.metadata || {}, _id: crypto.randomUUID() });
+  }
+
+  const vecs = await embedAll(chunksToIndex.map(x => x.text));
+  for (let i = 0; i < chunksToIndex.length; i++) {
+    const item = chunksToIndex[i],
+      doc: Chunk = { id: item._id, text: item.text, metadata: item.metadata, vector: vecs[i] };
+
+    col.chunks.push(doc);
+  }
+
+  saveSnapshot();
+  return sendJson(null, 200, { ok: true, indexed: chunksToIndex.length });
+}
+
+async function handleQuery(req: Request): Promise<Response> {
+  const body = await json<{ collection?: string; query?: string; topK?: number }>(req),
+    collection = String(body?.collection || "").trim(),
+    query = String(body?.query || "").trim(),
+    topK = Number(body?.topK || 5);
+
+  if (!collection || !query) return sendJson(null, 400, { error: "collection and query required" });
+
+  const col = db.get(collection);
+  if (!col) return sendJson(null, 404, { error: "collection not found" });
+
+  const [qvec] = await embedAll([query]),
+    scored = col.chunks.map((c) => ({ c, score: cosineSim(qvec, c.vector) }))
+      .sort((a, b) => b.score - a.score)
+      .slice(0, topK)
+      .map(x => ({ id: x.c.id, text: x.c.text, metadata: x.c.metadata, score: x.score }));
+  return sendJson(null, 200, { matches: scored });
+}
+
+async function handleChat(req: Request): Promise<Response> {
+  const body = await json<{ collection?: string; query?: string; topK?: number; model?: string }>(req),
+    collection = String(body?.collection || "").trim(),
+    query = String(body?.query || "").trim(),
+    topK = Number(body?.topK || 5),
+    model = body?.model || OLLAMA_CHAT_MODEL;
+
+  if (!collection || !query) return sendJson(null, 400, { error: "collection and query required" });
+
+  const col = db.get(collection);
+  if (!col) return sendJson(null, 404, { error: "collection not found" });
+
+  const [qvec] = await embedAll([query]),
+    matches = col.chunks.map((c) => ({ c, score: cosineSim(qvec, c.vector) }))
+      .sort((a, b) => b.score - a.score)
+      .slice(0, topK);
+
+  const context = matches.map((m, i) => `[[doc ${i + 1} score=${m.score.toFixed(3)}]]\n${m.c.text}`).join("\n\n"),
+    system: string = `you are a helpful assistant. use ONLY the provided context to answer. if the answer isn't in the context, say you don't know. cite as [doc N].`,
+    user: string = `question: ${query}\n\ncontext:\n${context}`;
+
+  const out = await ollamaChat({ model, messages: [{ role: "system", content: system }, { role: "user", content: user }], stream: false });
+  return sendJson(null, 200, {
+    answer: out?.message?.content || "",
+    citations: matches.map((m) => ({ id: m.c.id, score: m.score, text: m.c.text }))
+  });
+}
+
+const pickFunc = (pathname: string) => {
+  switch (pathname) {
+    case "/collections":
+      return handleCollections;
+    case "/upsert":
+      return handleUpsert;
+    case "/query":
+      return handleQuery;
+    case "/chat":
+      return handleChat;
+    default:
+      return undefined;
+  }
+};
+
+const server = serve({
+  port: PORT,
+  hostname: HOST,
+  fetch: async (req: Request): Promise<Response> => {
+    const u = new URL(req.url);
+    if (req.method === "GET" && u.pathname === "/") return new Response("ok");
+    if (req.method === "GET" && u.pathname === "/openapi.json") return sendJson(null, 200, OPENAPI);
+    return pickFunc(u.pathname)?.(req) || new Response("not found", { status: 404 });
+  }
+});
+
+console.log(`[rag] listening on http://${HOST}:${PORT}`);
\ No newline at end of file