feat: add Agent-Initiative RAG with ChromaDB — 4 tools (store/search/list/inspect), Ollama embeddings, schema-agnostic metadata storage

2026-05-22 09:15:55 +07:00 · 2026-05-22 09:15:55 +07:00 · 534c4fccdc
commit 534c4fccdc
parent 58b9eda7f7
5 changed files with 298 additions and 8 deletions
--- a/config.py
+++ b/config.py
@ -12,3 +12,11 @@ llm_timeout           = int( os.getenv("LLM_TIMEOUT",           default="600"
 AGENT_MAX_ITERATIONS  = int( os.getenv("AGENT_MAX_ITERATIONS",  default="10"                         ) )
 # Tool Configuration (for future use)
 MAX_TOOL_OUTPUT       = int( os.getenv("MAX_TOOL_OUTPUT",       default="4000"                       ) )
 # RAG Configuration
 # Directory handed to chromadb.PersistentClient(path=...) in tools/rag.py.
 RAG_PERSIST_DIR       =       os.getenv("RAG_PERSIST_DIR",      default="chroma_db"                  )
 # Model name sent in the "model" field of the Ollama embed request (tools/rag.py).
 RAG_EMBEDDING_MODEL   =       os.getenv("RAG_EMBEDDING_MODEL",  default="nomic-embed-text"           )
 # Whitelist of collections the RAG tools may touch: tools/rag.py rejects any
 # collection name that is not a key here. Each entry's "description" is what
 # the list_collections tool prints next to the name.
 RAG_COLLECTIONS = {
    "food_recommendations": {
        "description": "Menu makanan, preferensi pelanggan, data kuliner"
    },
 }
--- a/hendrik.py
+++ b/hendrik.py
@ -2,7 +2,7 @@ import os, sys
 import config
 from scripts.llm_client import LLMClient
-from tools import coder
+from tools import coder, rag
 from scripts import gadget
 from tui import HendrikTUI
@ -14,6 +14,10 @@ tools_definition = [
    gadget.tools_mapping( schema = coder.schema_run_bash,         handler = coder.run_bash         ),
    gadget.tools_mapping( schema = coder.schema_search_code,      handler = coder.search_code      ),
    gadget.tools_mapping( schema = coder.schema_git_operation,    handler = coder.git_operation    ),
    gadget.tools_mapping( schema = rag.schema_store_knowledge,    handler = rag.store_knowledge    ),
    gadget.tools_mapping( schema = rag.schema_search_knowledge,   handler = rag.search_knowledge   ),
    gadget.tools_mapping( schema = rag.schema_list_collections,   handler = rag.list_collections   ),
    gadget.tools_mapping( schema = rag.schema_inspect_collection, handler = rag.inspect_collection ),
 ]
 # Ekstrak dari tools_definition ke dua format berbeda
--- a/requirements.txt
+++ b/requirements.txt
@ -1 +1,2 @@
 python-dotenv>=1.0.0
 chromadb>=0.5.0
--- a/scripts/gadget.py
+++ b/scripts/gadget.py
@ -31,7 +31,16 @@ def build_system_prompt(tools_definition):
        "return it as plain text without tool calls.",
        "",
        f"Your workspace directory is: {os.getcwd()}. "
-        "All file operations are relative to this directory."
+        "All file operations are relative to this directory.",
        "",
        "RAG capabilities (knowledge retrieval):",
        "- list_collections       → see available knowledge bases.",
        "- inspect_collection     → learn metadata fields before searching.",
        "- search_knowledge       → semantic search + optional metadata filter.",
        "- store_knowledge        → save docs with rich metadata for later use.",
        "",
        "RAG workflow: inspect → search → reason. Always inspect a collection",
        "first to discover its metadata keys, then use them in search filters."
    ])
    return "\n".join(lines)
--- a/tools/rag.py
+++ b/tools/rag.py
@ -0,0 +1,268 @@
 import json
 import urllib.request
 import urllib.error
 from urllib.parse import urlparse
 import chromadb
 from chromadb.config import Settings
 import config
 # ── Embedding (Ollama) ───────────────────────────────────────────────
 from chromadb.api.types import EmbeddingFunction, Embeddings
 class OllamaEmbeddingFunction(EmbeddingFunction):
    """ChromaDB embedding function that delegates to an Ollama ``/api/embed`` endpoint."""
    def __init__(self, base_url, model, timeout=30):
        """Remember where the Ollama server lives and which model to use.

        Args:
            base_url: Any URL on the Ollama host. Only its scheme and
                host[:port] are kept; the path is discarded.
            model: Embedding model name sent in every request.
            timeout: Seconds to wait for the HTTP response (default 30,
                the value that used to be hard-coded).
        """
        parsed = urlparse(base_url.rstrip('/'))
        self.ollama_base = f"{parsed.scheme}://{parsed.netloc}"
        self.model = model
        self.timeout = timeout
    def __call__(self, input) -> Embeddings:
        """Embed one text or a list of texts.

        Args:
            input: A single string or a list of strings.

        Returns:
            The value of the ``embeddings`` field of the server's JSON reply.

        Raises:
            RuntimeError: If the request fails, the reply is not valid JSON,
                or the reply carries no ``embeddings`` field. The original
                exception, when there is one, is attached as ``__cause__``.
        """
        url = f"{self.ollama_base}/api/embed"
        texts = input if isinstance(input, list) else [input]
        payload = {"model": self.model, "input": texts}
        data = json.dumps(payload).encode('utf-8')
        req = urllib.request.Request(url, data=data, method='POST')
        req.add_header('Content-Type', 'application/json')
        try:
            with urllib.request.urlopen(req, timeout=self.timeout) as resp:
                response = json.loads(resp.read().decode('utf-8'))
        except Exception as e:
            # Keep the single RuntimeError contract but preserve the cause.
            raise RuntimeError(f"Embedding error: {e}") from e
        embeddings = response.get("embeddings") if isinstance(response, dict) else None
        if embeddings is None:
            # Previously surfaced as the opaque "Embedding error: 'embeddings'".
            raise RuntimeError("Embedding error: response contained no 'embeddings' field")
        return embeddings
 # ── ChromaDB singleton ───────────────────────────────────────────────
 # Lazily created module-level caches: _store is filled by _get_store() and
 # _ef by _get_ef() on their first call, then reused.
 _store = None
 _ef = None
 def _get_store():
    """Return the cached ChromaDB persistent client, building it on first use.

    The client persists under ``config.RAG_PERSIST_DIR`` and is created with
    anonymized telemetry switched off.
    """
    global _store
    if _store is not None:
        return _store
    client_settings = Settings(anonymized_telemetry=False)
    _store = chromadb.PersistentClient(
        path=config.RAG_PERSIST_DIR,
        settings=client_settings,
    )
    return _store
 def _get_ef():
    """Return the cached Ollama embedding function, building it on first use.

    It is constructed from ``config.llm_baseurl`` and
    ``config.RAG_EMBEDDING_MODEL``.
    """
    global _ef
    if _ef is not None:
        return _ef
    _ef = OllamaEmbeddingFunction(config.llm_baseurl, config.RAG_EMBEDDING_MODEL)
    return _ef
 def _collection(name):
    """Return the ChromaDB collection called ``name``, creating it if absent.

    Only names that are keys of ``config.RAG_COLLECTIONS`` are accepted.

    Raises:
        ValueError: If ``name`` is not configured; the message lists the
            configured names.
    """
    known = config.RAG_COLLECTIONS
    if name in known:
        return _get_store().get_or_create_collection(name=name, embedding_function=_get_ef())
    avail = ", ".join(known)
    raise ValueError(f"Unknown collection '{name}'. Available: {avail}")
 # ── Tool schemas ─────────────────────────────────────────────────────
 # Tool schema advertised to the LLM for store_knowledge(); it is paired with
 # the handler through gadget.tools_mapping in hendrik.py. The layout looks
 # like the OpenAI-style "function" tool format — confirm against gadget.py.
 # Each document needs "id" and "text"; "metadata" is optional.
 schema_store_knowledge = {
    "type": "function",
    "function": {
        "name": "store_knowledge",
        "description": (
            "Store one or more documents with arbitrary metadata into a RAG collection. "
            "Metadata is a free-form dict — choose meaningful keys for future filtering "
            "(e.g., restaurant, category, allergens, spice_level, taste_profile, price"
            ", customer_id, dietary)."
        ),
        "parameters": {
            "type": "object",
            "properties": {
                "collection": {
                    "type": "string",
                    "description": "Target collection name (must be defined in config)"
                },
                "documents": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "id": {"type": "string", "description": "Unique document ID"},
                            "text": {"type": "string", "description": "Document body text"},
                            "metadata": {
                                "type": "object",
                                "description": "Arbitrary key-value metadata",
                                "default": {}
                            }
                        },
                        "required": ["id", "text"]
                    },
                    "description": "List of documents to persist"
                }
            },
            "required": ["collection", "documents"]
        }
    }
 }
 # Tool schema advertised to the LLM for search_knowledge(). "filter" is
 # forwarded unchanged as the ChromaDB `where` argument by the handler.
 # NOTE(review): _sanitize_meta stores list values as JSON strings, so the
 # `$contains` example on 'allergens' may not act as a list-membership test —
 # confirm against the `where` operators of the pinned ChromaDB version.
 schema_search_knowledge = {
    "type": "function",
    "function": {
        "name": "search_knowledge",
        "description": (
            "Semantically search a RAG collection. Optionally narrow with a "
            "metadata filter using ChromaDB where syntax. "
            "Examples: {'category': 'main_course'}, {'spice_level': {'$lte': 2}}, "
            "{'allergens': {'$contains': 'seafood'}}."
        ),
        "parameters": {
            "type": "object",
            "properties": {
                "collection": {
                    "type": "string",
                    "description": "Collection name to search in"
                },
                "query": {
                    "type": "string",
                    "description": "Natural-language search query"
                },
                "n_results": {
                    "type": "integer",
                    "description": "Max results to return (default 5)",
                    "default": 5
                },
                "filter": {
                    "type": "object",
                    "description": "Optional metadata filter dict",
                    "default": None
                }
            },
            "required": ["collection", "query"]
        }
    }
 }
 # Tool schema advertised to the LLM for list_collections(); the tool takes
 # no arguments, hence the empty "properties".
 schema_list_collections = {
    "type": "function",
    "function": {
        "name": "list_collections",
        "description": "List all available RAG collections defined in config with their descriptions.",
        "parameters": {"type": "object", "properties": {}}
    }
 }
 # Tool schema advertised to the LLM for inspect_collection(). Only
 # "collection" is required; "sample_size" mirrors the handler's default of 3.
 schema_inspect_collection = {
    "type": "function",
    "function": {
        "name": "inspect_collection",
        "description": (
            "Examine sample documents and metadata fields in a RAG collection. "
            "Always call this before search_knowledge to learn what metadata keys "
            "are available for filtering, then pass them in the filter parameter."
        ),
        "parameters": {
            "type": "object",
            "properties": {
                "collection": {
                    "type": "string",
                    "description": "Collection name to inspect"
                },
                "sample_size": {
                    "type": "integer",
                    "description": "Number of sample documents (default 3)",
                    "default": 3
                }
            },
            "required": ["collection"]
        }
    }
 }
 # ── Tool handlers ────────────────────────────────────────────────────
 def _sanitize_meta(meta):
    """ChromaDB metadata only allows str/int/float/bool. Convert lists to JSON string, remove empty lists."""
    out = {}
    for k, v in meta.items():
        if isinstance(v, list):
            if len(v) == 0:
                continue
            out[k] = json.dumps(v, ensure_ascii=False)
        elif isinstance(v, (str, int, float, bool)):
            out[k] = v
        else:
            out[k] = str(v)
    return out
 def store_knowledge(collection, documents):
    """Persist documents (with optional metadata) into a RAG collection.

    Args:
        collection: Name of a collection listed in ``config.RAG_COLLECTIONS``.
        documents: List of dicts with required ``id`` and ``text`` keys and
            an optional ``metadata`` dict.

    Returns:
        A confirmation string, or ``"Error: ..."`` describing what went
        wrong. Exceptions are never propagated to the caller.
    """
    try:
        col = _collection(collection)
        if not documents:
            return "Error: no documents provided."
        ids, texts, metas = [], [], []
        for pos, doc in enumerate(documents):
            for field in ("id", "text"):
                if field not in doc:
                    # Clearer than the bare KeyError text ("Error: 'id'").
                    raise ValueError(f"document #{pos} is missing required field '{field}'")
            ids.append(doc["id"])
            texts.append(doc["text"])
            # Metadata is optional, and ChromaDB's metadata validation
            # rejects an empty dict, so "no metadata" is passed as None.
            metas.append(_sanitize_meta(doc.get("metadata") or {}) or None)
        col.add(ids=ids, documents=texts, metadatas=metas if any(metas) else None)
        return f"Stored {len(documents)} document(s) in '{collection}'."
    except Exception as e:
        return f"Error: {e}"
 def search_knowledge(collection, query, n_results=5, filter=None):
    """Run a semantic query against a RAG collection and format the hits.

    Args:
        collection: Name of a collection listed in ``config.RAG_COLLECTIONS``.
        query: Natural-language query text.
        n_results: Maximum number of hits (default 5). Coerced to an int and
            raised to at least 1; ``None`` means the default.
        filter: Optional dict forwarded as the ChromaDB ``where`` argument.

    Returns:
        One entry per hit (id, distance, text cut to 500 characters,
        metadata as JSON) joined by ``---`` lines; ``"No results found."``
        when nothing matches; ``"Error: ..."`` on any failure.
    """
    try:
        col = _collection(collection)
        limit = 5 if n_results is None else max(1, int(n_results))
        kw = {"query_texts": [query], "n_results": limit}
        if filter:
            kw["where"] = filter
        r = col.query(**kw)
        if not r["ids"] or not r["ids"][0]:
            return "No results found."
        ids = r["ids"][0]
        blank = [None] * len(ids)
        docs = r["documents"][0] if r.get("documents") else blank
        metas = r["metadatas"][0] if r.get("metadatas") else blank
        dists = r["distances"][0] if r.get("distances") else blank
        out = []
        for did, txt, meta, dist in zip(ids, docs, metas, dists):
            txt = txt or ""
            if len(txt) > 500:
                txt = txt[:500] + "..."
            meta_text = json.dumps(meta, ensure_ascii=False) if meta else "{}"
            # ChromaDB reports a distance (smaller = closer), so it is
            # labelled as one rather than as a "score".
            dist_text = f" (distance: {dist:.4f})" if dist is not None else ""
            out.append(f"[{did}]{dist_text}\n  text: {txt}\n  metadata: {meta_text}")
        return "\n---\n".join(out)
    except Exception as e:
        return f"Error: {e}"
 def list_collections():
    """Describe every collection configured in ``config.RAG_COLLECTIONS``.

    Returns:
        A header line followed by one ``- name: description`` line per
        collection, a fixed notice when none are configured, or
        ``"Error: ..."`` if anything raises.
    """
    try:
        registry = config.RAG_COLLECTIONS
        if not registry:
            return "No collections defined in config."
        lines = ["Available collections:"]
        for name, info in registry.items():
            lines.append(f"- {name}: {info.get('description', '')}")
        return "\n".join(lines)
    except Exception as e:
        return f"Error: {e}"
 def inspect_collection(collection, sample_size=3):
    """Show sample documents and the metadata keys found in a collection.

    Args:
        collection: Name of a collection listed in ``config.RAG_COLLECTIONS``.
        sample_size: How many documents to sample (default 3). Coerced to an
            int, raised to at least 1 and capped at the collection size;
            ``None`` means the default.

    Returns:
        A report with the total count, each sampled document (text cut to
        200 characters) and the sorted union of metadata keys seen in the
        sample; a notice when the collection is empty; ``"Error: ..."`` on
        any failure.
    """
    try:
        col = _collection(collection)
        cnt = col.count()
        if cnt == 0:
            return f"Collection '{collection}' is empty."
        requested = 3 if sample_size is None else int(sample_size)
        # A zero or negative limit must never reach col.get().
        n = min(max(1, requested), cnt)
        r = col.get(limit=n, include=["documents", "metadatas"])
        ids = r["ids"]
        blank = [None] * len(ids)
        docs = r.get("documents") or blank
        metas = r.get("metadatas") or blank
        out = [f"Collection: {collection}  |  Total documents: {cnt}", f"Sample ({n}):"]
        keys = set()
        for did, txt, m in zip(ids, docs, metas):
            txt = txt or ""
            if len(txt) > 200:
                txt = txt[:200] + "..."
            meta = json.dumps(m, ensure_ascii=False) if m else "(none)"
            out.append(f"\n  [{did}]  text: {txt}  metadata: {meta}")
            if m:
                keys.update(m.keys())
        if keys:
            out.append(f"\nMetadata keys: {', '.join(sorted(keys))}")
        return "\n".join(out)
    except Exception as e:
        return f"Error: {e}"
`@ -1 +1,2 @@`
	`python-dotenv>=1.0.0`	`python-dotenv>=1.0.0`
		`chromadb>=0.5.0`