Unfinished chunking features

This commit is contained in:
Dita Aji Pratama 2026-06-08 09:27:38 +07:00
parent d833774809
commit 6e88d051bc
5 changed files with 196 additions and 5 deletions

View File

@ -14,6 +14,7 @@ tools_definition = [
gadget.tools_mapping( schema = coder.schema_run_bash, handler = coder.run_bash ),
gadget.tools_mapping( schema = coder.schema_search_code, handler = coder.search_code ),
gadget.tools_mapping( schema = coder.schema_git_operation, handler = coder.git_operation ),
gadget.tools_mapping( schema = rag.schema_ingest_files, handler = rag.ingest_files ),
gadget.tools_mapping( schema = rag.schema_store_knowledge, handler = rag.store_knowledge ),
gadget.tools_mapping( schema = rag.schema_search_knowledge, handler = rag.search_knowledge ),
gadget.tools_mapping( schema = rag.schema_create_collection, handler = rag.create_collection ),

View File

@ -1,2 +1,3 @@
python-dotenv>=1.0.0
chromadb>=0.5.0
openpyxl>=3.1.0

View File

@ -40,9 +40,10 @@ def build_system_prompt(tools_definition):
"- inspect_collection → learn metadata fields before searching.",
"- search_knowledge → semantic search + optional metadata filter.",
"- store_knowledge → save docs with rich metadata for later use.",
"- ingest_files → read files (with glob patterns) into a collection, auto-chunking.",
"",
"You can create collections yourself! When you encounter a new topic,",
"use create_collection first, then store_knowledge to populate it.",
"use create_collection first, then store_knowledge or ingest_files to populate it.",
"Always inspect_collection to discover metadata keys before filtering."
])
return "\n".join(lines)

View File

@ -1,4 +1,7 @@
import glob as globmod
import json
import os
import time
import chromadb
from chromadb.config import Settings
@ -183,6 +186,50 @@ schema_inspect_collection = {
}
# Tool schema for the `ingest_files` handler: declares the function name,
# its natural-language description, and the JSON-schema of its arguments.
schema_ingest_files = {
    "type": "function",
    "function": {
        "name": "ingest_files",
        # Sentences are joined with single spaces to form one description.
        "description": " ".join([
            "Read one or more files (supports glob patterns like *.py or src/**/*.md)",
            "and store their content into a RAG collection.",
            "Optionally chunk files into smaller pieces by line count.",
            "Automatically extracts metadata: filename, path, extension, size, modification time.",
        ]),
        "parameters": {
            "type": "object",
            "properties": {
                # Destination collection.
                "collection": dict(
                    type="string",
                    description="Target collection name (will be created if it doesn't exist)",
                ),
                # Literal paths and/or glob patterns.
                "paths": dict(
                    type="array",
                    items=dict(type="string"),
                    description="File paths or glob patterns (e.g., ['*.txt', 'src/**/*.py'])",
                ),
                # Chunking controls, both expressed in lines.
                "chunk_size": dict(
                    type="integer",
                    description="Lines per chunk (0 = whole file as one document)",
                    default=0,
                ),
                "chunk_overlap": dict(
                    type="integer",
                    description="Line overlap between chunks (only used when chunk_size > 0)",
                    default=0,
                ),
                "recursive": dict(
                    type="boolean",
                    description="Search directories recursively when using glob patterns",
                    default=True,
                ),
            },
            "required": ["collection", "paths"],
        },
    },
}
# ── Tool handlers ────────────────────────────────────────────────────
def _sanitize_meta(meta):
@ -294,3 +341,144 @@ def inspect_collection(collection, sample_size=3):
return "\n".join(out)
except Exception as e:
return f"Error: {e}"
def _chunk_line_ranges(n_lines, chunk_size, chunk_overlap):
    """Yield half-open ``(start, end)`` line ranges that cover ``n_lines`` lines.

    Consecutive ranges share ``chunk_overlap`` lines. A negative overlap is
    treated as 0 (otherwise lines between chunks would be dropped), and an
    overlap >= ``chunk_size`` degrades to a step of one line so the loop
    always advances.
    """
    overlap = max(chunk_overlap, 0)
    step = max(chunk_size - overlap, 1)
    start = 0
    while start < n_lines:
        end = min(start + chunk_size, n_lines)
        yield start, end
        if end >= n_lines:
            # The last line is covered; any further chunk would only repeat
            # a suffix of this one.
            break
        start += step


def _build_documents(lines, joiner, id_prefix, meta, chunk_size, chunk_overlap):
    """Turn one file's (or sheet's) lines into ``(doc_id, text, metadata)`` tuples.

    With ``chunk_size <= 0`` the whole content becomes a single document whose
    ID is ``id_prefix``; otherwise one document per chunk is produced, with
    ``chunk_index``, ``chunk_lines`` and 1-based ``chunk_start_line`` added to
    the metadata.
    """
    if chunk_size <= 0:
        return [(id_prefix, joiner.join(lines), _sanitize_meta(meta))]

    docs = []
    ranges = _chunk_line_ranges(len(lines), chunk_size, chunk_overlap)
    for cid, (start, end) in enumerate(ranges):
        chunk_meta = dict(meta)
        chunk_meta["chunk_index"] = cid
        chunk_meta["chunk_lines"] = end - start
        chunk_meta["chunk_start_line"] = start + 1
        docs.append((
            f"{id_prefix}_chunk_{cid}",
            joiner.join(lines[start:end]),
            _sanitize_meta(chunk_meta),
        ))
    return docs


def _read_file_documents(fpath, chunk_size, chunk_overlap):
    """Read one file and return its documents; an empty list means "skip".

    ``.xlsx``/``.xlsm`` workbooks are read sheet by sheet through openpyxl
    (rows become tab-separated lines); everything else is read as UTF-8 text
    with undecodable bytes replaced. An empty list is returned when openpyxl
    is not installed, the file cannot be opened, or there is no non-blank
    content.
    """
    rel_path = os.path.relpath(fpath)
    stat = os.stat(fpath)
    ext = os.path.splitext(fpath)[1].lower()
    base_meta = {
        "filename": os.path.basename(fpath),
        "path": rel_path,
        "extension": ext,
        "size": stat.st_size,
        "mtime": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(stat.st_mtime)),
    }

    if ext in (".xlsx", ".xlsm"):
        try:
            import openpyxl
        except ImportError:
            return []
        docs = []
        wb = openpyxl.load_workbook(fpath, read_only=True, data_only=True)
        try:
            for sheet_name in wb.sheetnames:
                rows = [
                    "\t".join("" if cell is None else str(cell) for cell in row)
                    for row in wb[sheet_name].iter_rows(values_only=True)
                ]
                if not "\n".join(rows).strip():
                    continue  # blank sheet
                sheet_meta = dict(base_meta)
                sheet_meta["sheet"] = sheet_name
                docs.extend(_build_documents(
                    rows, "\n", f"{rel_path}_{sheet_name}", sheet_meta,
                    chunk_size, chunk_overlap,
                ))
        finally:
            # Release the workbook even if a sheet fails midway.
            wb.close()
        return docs

    # Plain-text files
    try:
        with open(fpath, "r", encoding="utf-8", errors="replace") as f:
            lines = f.readlines()
    except OSError:
        return []
    if not "".join(lines).strip():
        return []
    # Lines keep their own newline, hence the empty joiner.
    return _build_documents(lines, "", rel_path, base_meta, chunk_size, chunk_overlap)


def ingest_files(collection, paths, chunk_size=0, chunk_overlap=0, recursive=True):
    """Read files matching ``paths`` and add their content to a collection.

    Args:
        collection: Name of the target collection (resolved via ``_collection``).
        paths: List of file paths or glob patterns; a single string is
            accepted and treated as a one-element list.
        chunk_size: Lines per chunk; 0 (or less) stores each file/sheet as one
            document.
        chunk_overlap: Lines shared between consecutive chunks.
        recursive: Passed to ``glob.glob`` so ``**`` can match nested
            directories.

    Returns:
        A human-readable summary string, ``"No matching files found."`` when
        nothing matched, or ``"Error: ..."`` if any exception was raised.

    Document IDs are derived from the file's relative path (plus sheet name
    and chunk index where applicable) so that files sharing a basename do not
    collide within one batch.
    """
    try:
        col = _collection(collection)

        # Tool callers sometimes send a bare string; iterating it would glob
        # every single character.
        if isinstance(paths, str):
            paths = [paths]
        chunk_size = int(chunk_size or 0)
        chunk_overlap = int(chunk_overlap or 0)

        # Expand glob patterns into real file paths. Absolute paths are used
        # as set keys so the same file reached through two spellings
        # ("a.txt" vs "./a.txt") is ingested only once.
        file_set = set()
        skipped = 0
        for pattern in paths:
            matches = globmod.glob(pattern, recursive=recursive)
            if matches:
                file_set.update(os.path.abspath(m) for m in matches)
            elif os.path.isfile(pattern):
                # Literal path containing glob metacharacters, e.g. "a[1].txt".
                file_set.add(os.path.abspath(pattern))
            else:
                skipped += 1

        if not file_set:
            return "No matching files found."

        documents = []
        processed = 0
        for fpath in sorted(file_set):
            if not os.path.isfile(fpath):
                skipped += 1  # directories matched by the pattern
                continue
            file_docs = _read_file_documents(fpath, chunk_size, chunk_overlap)
            if file_docs:
                documents.extend(file_docs)
                processed += 1  # counted once per file, not once per sheet
            else:
                skipped += 1

        if documents:
            ids, texts, metas = (list(column) for column in zip(*documents))
            # NOTE(review): add() presumably does not overwrite documents whose
            # IDs already exist; confirm whether re-ingestion should upsert.
            col.add(ids=ids, documents=texts, metadatas=metas)

        parts = [f"Ingested {processed} file(s) into '{collection}'"]
        if processed > 0:
            parts.append(f"({len(documents)} document(s) total)")
        if skipped > 0:
            parts.append(f"({skipped} file(s) skipped)")
        return " ".join(parts)
    except Exception as e:
        return f"Error: {e}"

View File

@ -7,10 +7,10 @@ WELCOME_ART = """\
\n\
/\\_/\\ H E N D R I K
( o.o ) AI Agent
> ^ < siap membantu!
( )
/\\_/\\
( o.o ) HENDRIK
> ^ <
( ) AI Agent
(___)
"""