Improving RAG

This commit is contained in:
Dita Aji Pratama 2026-06-24 17:11:03 +07:00
parent 059445de5e
commit 03221ca119
4 changed files with 167 additions and 108 deletions

View File

@ -122,7 +122,8 @@ SESSION_DB_PATH = os.path.expanduser(
# ─── RAG (YAML) ───────────────────────────────────────────────────────────────── # ─── RAG (YAML) ─────────────────────────────────────────────────────────────────
RAG_PERSIST_DIR = os.getenv("RAG_PERSIST_DIR", default=_yaml_get("rag", "persist_dir", default="chroma_db")) RAG_PERSIST_DIR = os.getenv("RAG_PERSIST_DIR", default=_yaml_get("rag", "persist_dir", default="lancedb_data"))
RAG_MODEL_PATH = os.getenv("RAG_MODEL_PATH", default=_yaml_get("rag", "model_path", default=""))
# ─── Humanize Delay (YAML) ───────────────────────────────────────────────────── # ─── Humanize Delay (YAML) ─────────────────────────────────────────────────────

View File

@ -34,7 +34,8 @@ llm:
- name : "z-ai/glm-5" - name : "z-ai/glm-5"
rag: rag:
persist_dir: chroma_db # ChromaDB ONNX default (all-MiniLM-L6-v2, local) persist_dir: "~/.config/hendrik/rag" # LanceDB Vector Store (all-MiniLM-L6-v2, local)
model_path: "~/.config/hendrik/models" # Custom path to store/load embedding model.
session: session:
db_path: "~/.config/hendrik/sessions.json" db_path: "~/.config/hendrik/sessions.json"
@ -50,8 +51,7 @@ telegram:
allowed_group_ids: "" # comma-separated, empty = all groups allowed_group_ids: "" # comma-separated, empty = all groups
selective_response: true # true = only respond if mentioned/relevant selective_response: true # true = only respond if mentioned/relevant
# Humanize Delay (anti-bot detection) delay: # Humanize Delay (anti-bot detection)
delay:
read_min: 1.0 # second read_min: 1.0 # second
read_max: 2.0 # second read_max: 2.0 # second
typing_speed: 15.0 # characters per second typing_speed: 15.0 # characters per second

View File

@ -5,3 +5,7 @@ openpyxl>=3.1.0
slixmpp slixmpp
python-telegram-bot>=20.0 python-telegram-bot>=20.0
tinydb>=4.8.0 tinydb>=4.8.0
lancedb
sentence-transformers
pandas
pylance

View File

@ -2,29 +2,78 @@ import glob as globmod
import json import json
import os import os
import time import time
import pandas as pd
import chromadb import lancedb
from chromadb.config import Settings from lancedb.pydantic import LanceModel
from sentence_transformers import SentenceTransformer
import config import config
# ── Embedding Setup ───────────────────────────────────────────────────────
def load_embedding_model():
    """Load the sentence-transformers embedding model selected by the config.

    Resolution order, driven by ``config.RAG_MODEL_PATH``:

    1. Empty path: load ``all-MiniLM-L6-v2`` from the library's default cache.
    2. Path set but missing (or an empty directory): download the model,
       save it under that path, and return it.
    3. Path set and populated: load the model directly from that directory.

    Returns:
        The loaded ``SentenceTransformer`` instance, or ``None`` when loading
        failed; the error is printed instead of being raised.
    """
    model_name = "all-MiniLM-L6-v2"
    # Expand "~" first: the configured value is a raw string, and without
    # expansion the existence check never matches a home-relative path and
    # the model would be saved into a literal "~" directory.
    custom_path = os.path.expanduser((config.RAG_MODEL_PATH or "").strip())

    try:
        if not custom_path:
            # Case 1: use the default cache.
            print(f"[RAG] Loading embedding model '{model_name}' from default cache...")
            return SentenceTransformer(model_name)

        # Cases 2 & 3: a custom path is configured. An existing but empty
        # directory holds no model yet, so it is treated as "not downloaded".
        if os.path.isdir(custom_path) and os.listdir(custom_path):
            print(f"[RAG] Loading embedding model from custom path: {custom_path}")
            return SentenceTransformer(custom_path)

        print(f"[RAG] Custom path {custom_path} not found. Downloading model first...")
        model = SentenceTransformer(model_name)
        # Create the target directory if it does not exist yet.
        os.makedirs(custom_path, exist_ok=True)
        model.save(custom_path)
        print(f"[RAG] Model successfully downloaded and saved to: {custom_path}")
        return model
    except Exception as e:
        print(f"[RAG] Critical Error loading embedding model: {e}")
        return None
# Initialise the embedding model once, when the module is imported.
embedding_model = load_embedding_model()


def get_embedding(text):
    """Return the embedding of *text* as a plain Python list.

    Args:
        text: The input handed to the model's ``encode`` method.

    Returns:
        The encoded vector converted with ``.tolist()``.

    Raises:
        RuntimeError: If the embedding model could not be loaded at startup.
    """
    if embedding_model is None:
        # RuntimeError is still caught by the handlers' ``except Exception``.
        raise RuntimeError("Embedding model not loaded. Check your config or internet connection.")
    return embedding_model.encode(text).tolist()
# Simple schema, kept minimal to avoid Pydantic conflicts.
# NOTE(review): LanceDB's documentation declares embedding columns with
# `Vector(dim)` (a fixed-size list); confirm that a plain `list[float]`
# column is accepted by vector search on tables created from this schema.
class DocumentSchema(LanceModel):
    # Document body that gets embedded and returned by searches.
    text: str
    # Caller-supplied document identifier.
    id: str
    # Metadata dict serialised to a JSON string by the tool handlers.
    metadata: str
    # Embedding of `text`.
    vector: list[float]
# ── LanceDB singleton ───────────────────────────────────────────────────────
_db = None


def _get_db():
    """Return the module-wide LanceDB connection, opening it on first use."""
    global _db
    if _db is not None:
        return _db
    _db = lancedb.connect(config.RAG_PERSIST_DIR)
    return _db
def _get_table(name):
    """Open the table called *name*, creating it from ``DocumentSchema`` if absent."""
    connection = _get_db()
    already_present = name in connection.table_names()
    if not already_present:
        return connection.create_table(name, schema=DocumentSchema)
    return connection.open_table(name)
# ── Tool schemas ───────────────────────────────────────────────────── # ── Tool schemas ─────────────────────────────────────────────────────
@ -74,9 +123,8 @@ schema_search_knowledge = {
"name": "search_knowledge", "name": "search_knowledge",
"description": ( "description": (
"Semantically search a RAG collection. Optionally narrow with a " "Semantically search a RAG collection. Optionally narrow with a "
"metadata filter using ChromaDB where syntax. " "metadata filter using SQL-like syntax. "
"Examples: {'category': 'main_course'}, {'spice_level': {'$lte': 2}}, " "Example: \"metadata LIKE '%main_course%'\""
"{'allergens': {'$contains': 'seafood'}}."
), ),
"parameters": { "parameters": {
"type": "object", "type": "object",
@ -95,8 +143,8 @@ schema_search_knowledge = {
"default": 5 "default": 5
}, },
"filter": { "filter": {
"type": "object", "type": "string",
"description": "Optional metadata filter dict", "description": "Optional SQL-like filter for metadata JSON string",
"default": None "default": None
} }
}, },
@ -185,7 +233,6 @@ schema_inspect_collection = {
} }
} }
schema_ingest_files = { schema_ingest_files = {
"type": "function", "type": "function",
"function": { "function": {
@ -229,138 +276,134 @@ schema_ingest_files = {
} }
} }
# ── Tool handlers ──────────────────────────────────────────────────── # ── Tool handlers ────────────────────────────────────────────────────
def _sanitize_meta(meta):
"""ChromaDB metadata only allows str/int/float/bool. Convert lists to JSON string, remove empty lists."""
out = {}
for k, v in meta.items():
if isinstance(v, list):
if len(v) == 0:
continue
out[k] = json.dumps(v, ensure_ascii=False)
elif isinstance(v, (str, int, float, bool)):
out[k] = v
else:
out[k] = str(v)
return out
def store_knowledge(collection, documents):
    """Embed *documents* and append them to the *collection* table.

    Each document is a dict with ``id`` and ``text`` keys and an optional
    ``metadata`` dict, which is stored as a JSON string. Returns a status
    message; any failure is reported as ``"Error: ..."`` instead of raising.
    """
    try:
        target = _get_table(collection)
        rows = [
            {
                "id": entry["id"],
                "text": entry["text"],
                "metadata": json.dumps(entry.get("metadata", {}), ensure_ascii=False),
                "vector": get_embedding(entry["text"]),
            }
            for entry in documents
        ]
        target.add(rows)
        return f"Stored {len(documents)} document(s) in '{collection}'."
    except Exception as e:
        return f"Error: {e}"
def search_knowledge(collection, query, n_results=5, filter=None):
    """Run a semantic search over *collection* and format the hits as text.

    Args:
        collection: Name of the table to search.
        query: Natural-language query; it is embedded before searching.
        n_results: Maximum number of hits to return.
        filter: Optional SQL-like predicate forwarded to ``where()``.

    Returns:
        The hits joined by ``---`` separators, ``"No results found."`` when
        the search is empty, or ``"Error: ..."`` if anything fails.
    """
    try:
        table = _get_table(collection)
        # Build the query once and attach the filter only when one is given.
        search = table.search(get_embedding(query))
        if filter:
            search = search.where(filter)
        df = search.limit(n_results).to_pandas()

        if df.empty:
            return "No results found."

        out = []
        for _, row in df.iterrows():
            txt = row["text"]
            if len(txt) > 500:
                txt = txt[:500] + "..."
            # Vector searches expose the distance as `_distance`; show it so
            # the caller can judge relevance. Skip it if the column is absent.
            dist = row.get("_distance")
            score = f" (score: {dist:.4f})" if dist is not None else ""
            out.append(f"[{row['id']}]{score}\n text: {txt}\n metadata: {row['metadata']}")
        return "\n---\n".join(out)
    except Exception as e:
        return f"Error: {e}"
def create_collection(name, description=""):
    """Make sure a collection called *name* exists and report the outcome.

    *description* is accepted as a parameter but is not used by this body.
    """
    try:
        _get_table(name)
    except Exception as e:
        return f"Error: {e}"
    else:
        return f"Collection '{name}' is ready."
def delete_collection(name):
    """Drop the collection called *name*.

    The table is removed through the database connection instead of deleting
    a directory by hand: a hand-built ``<persist_dir>/<name>`` path does not
    match the on-disk table directory and breaks when the persist dir
    contains an unexpanded ``~``, so nothing was removed while success was
    still reported.

    Returns:
        A confirmation message, or ``"Error: ..."`` if the drop fails (for
        example when the table does not exist).
    """
    try:
        _get_db().drop_table(name)
        return f"Deleted collection '{name}'."
    except Exception as e:
        return f"Error: {e}"
def list_collections():
    """List every collection together with its document count.

    Returns:
        A bulleted listing, ``"No collections exist yet."`` when the database
        has no tables, or ``"Error: ..."`` if anything fails.
    """
    try:
        db = _get_db()
        names = db.table_names()
        if not names:
            return "No collections exist yet."
        out = ["Available collections:"]
        for name in names:
            # count_rows() avoids loading every row (and its embedding
            # vector) into a DataFrame just to take its length.
            cnt = db.open_table(name).count_rows()
            out.append(f"- {name} [{cnt} docs]")
        return "\n".join(out)
    except Exception as e:
        return f"Error: {e}"
def inspect_collection(collection, sample_size=3):
    """Summarise *collection*: total size, a few sample rows, metadata keys.

    Args:
        collection: Name of the table to inspect.
        sample_size: Maximum number of rows to show; negative values are
            treated as zero.

    Returns:
        A multi-line report, a notice when the collection is empty, or
        ``"Error: ..."`` if anything fails.
    """
    try:
        table = _get_table(collection)
        # Count and sample without materialising the whole table.
        cnt = table.count_rows()
        if cnt == 0:
            return f"Collection '{collection}' is empty."

        n = max(0, min(sample_size, cnt))
        rows = table.head(n).to_pylist() if n > 0 else []

        out = [f"Collection: {collection} | Total documents: {cnt}", f"Sample ({n}):"]
        keys = set()
        for row in rows:
            txt = row["text"]
            if len(txt) > 200:
                txt = txt[:200] + "..."
            meta = row["metadata"]
            out.append(f"\n [{row['id']}] text: {txt} metadata: {meta}")

            # Metadata is stored as a JSON string; rows whose metadata is not
            # a JSON object simply contribute no keys.
            try:
                parsed = json.loads(meta)
            except (TypeError, ValueError):
                continue
            if isinstance(parsed, dict):
                keys.update(parsed)

        if keys:
            out.append(f"\nMetadata keys: {', '.join(sorted(keys))}")
        return "\n".join(out)
    except Exception as e:
        return f"Error: {e}"
def ingest_files(collection, paths, chunk_size=0, chunk_overlap=0, recursive=True): def ingest_files(collection, paths, chunk_size=0, chunk_overlap=0, recursive=True):
try: try:
col = _collection(collection) table = _get_table(collection)
all_ids, all_texts, all_metas = [], [], [] all_data = []
processed, skipped = 0, 0 processed, skipped = 0, 0
# Expand glob patterns into real file paths
file_set = set() file_set = set()
for p in paths: for p in paths:
expanded = globmod.glob(p, recursive=recursive) expanded = globmod.glob(p, recursive=recursive)
if expanded: if expanded:
file_set.update(expanded) file_set.update(expanded)
elif os.path.isfile(p):
file_set.add(p)
else: else:
# Maybe it's a literal path that doesn't look like a glob skipped += 1
if os.path.isfile(p):
file_set.add(p)
else:
skipped += 1
if not file_set: if not file_set:
return "No matching files found." return "No matching files found."
@ -381,7 +424,6 @@ def ingest_files(collection, paths, chunk_size=0, chunk_overlap=0, recursive=Tru
} }
base_name = os.path.splitext(os.path.basename(fpath))[0] base_name = os.path.splitext(os.path.basename(fpath))[0]
# ── read content ──────────────────────────────────────────
if ext in (".xlsx", ".xlsm"): if ext in (".xlsx", ".xlsm"):
try: try:
import openpyxl import openpyxl
@ -396,6 +438,7 @@ def ingest_files(collection, paths, chunk_size=0, chunk_overlap=0, recursive=Tru
for row in ws.iter_rows(values_only=True): for row in ws.iter_rows(values_only=True):
vals = [str(c) if c is not None else "" for c in row] vals = [str(c) if c is not None else "" for c in row]
rows.append("\t".join(vals)) rows.append("\t".join(vals))
lines = rows lines = rows
content = "\n".join(lines) content = "\n".join(lines)
if not content.strip(): if not content.strip():
@ -416,22 +459,27 @@ def ingest_files(collection, paths, chunk_size=0, chunk_overlap=0, recursive=Tru
meta["chunk_index"] = cid meta["chunk_index"] = cid
meta["chunk_lines"] = end - start meta["chunk_lines"] = end - start
meta["chunk_start_line"] = start + 1 meta["chunk_start_line"] = start + 1
all_ids.append(doc_id) all_data.append({
all_texts.append(chunk_text) "id": doc_id,
all_metas.append(_sanitize_meta(meta)) "text": chunk_text,
"metadata": json.dumps(meta, ensure_ascii=False),
"vector": get_embedding(chunk_text)
})
cid += 1 cid += 1
step = chunk_size - chunk_overlap step = chunk_size - chunk_overlap
start += step if step > 0 else 1 start += step if step > 0 else 1
processed += 1 processed += 1
else: else:
doc_id = f"{base_name}_{sheet_name}" doc_id = f"{base_name}_{sheet_name}"
all_ids.append(doc_id) all_data.append({
all_texts.append(content) "id": doc_id,
all_metas.append(_sanitize_meta(sheet_meta)) "text": content,
"metadata": json.dumps(sheet_meta, ensure_ascii=False),
"vector": get_embedding(content)
})
processed += 1 processed += 1
wb.close() wb.close()
else: else:
# Plain-text files
try: try:
with open(fpath, "r", encoding="utf-8", errors="replace") as f: with open(fpath, "r", encoding="utf-8", errors="replace") as f:
lines = f.readlines() lines = f.readlines()
@ -456,26 +504,32 @@ def ingest_files(collection, paths, chunk_size=0, chunk_overlap=0, recursive=Tru
meta["chunk_index"] = cid meta["chunk_index"] = cid
meta["chunk_lines"] = end - start meta["chunk_lines"] = end - start
meta["chunk_start_line"] = start + 1 meta["chunk_start_line"] = start + 1
all_ids.append(doc_id) all_data.append({
all_texts.append(chunk_text) "id": doc_id,
all_metas.append(_sanitize_meta(meta)) "text": chunk_text,
"metadata": json.dumps(meta, ensure_ascii=False),
"vector": get_embedding(chunk_text)
})
cid += 1 cid += 1
step = chunk_size - chunk_overlap step = chunk_size - chunk_overlap
start += step if step > 0 else 1 start += step if step > 0 else 1
processed += 1 processed += 1
else: else:
doc_id = base_name doc_id = base_name
all_ids.append(doc_id) all_data.append({
all_texts.append(content) "id": doc_id,
all_metas.append(_sanitize_meta(base_meta)) "text": content,
"metadata": json.dumps(base_meta, ensure_ascii=False),
"vector": get_embedding(content)
})
processed += 1 processed += 1
if all_ids: if all_data:
col.add(ids=all_ids, documents=all_texts, metadatas=all_metas) table.add(all_data)
parts = [f"Ingested {processed} file(s) into '{collection}'"] parts = [f"Ingested {processed} file(s) into '{collection}'"]
if processed > 0: if processed > 0:
parts.append(f"({len(all_ids)} document(s) total)") parts.append(f"({len(all_data)} document(s) total)")
if skipped > 0: if skipped > 0:
parts.append(f"({skipped} file(s) skipped)") parts.append(f"({skipped} file(s) skipped)")
return " ".join(parts) return " ".join(parts)