Unfinished chunking features
This commit is contained in:
parent
d833774809
commit
6e88d051bc
@ -14,6 +14,7 @@ tools_definition = [
|
|||||||
gadget.tools_mapping( schema = coder.schema_run_bash, handler = coder.run_bash ),
|
gadget.tools_mapping( schema = coder.schema_run_bash, handler = coder.run_bash ),
|
||||||
gadget.tools_mapping( schema = coder.schema_search_code, handler = coder.search_code ),
|
gadget.tools_mapping( schema = coder.schema_search_code, handler = coder.search_code ),
|
||||||
gadget.tools_mapping( schema = coder.schema_git_operation, handler = coder.git_operation ),
|
gadget.tools_mapping( schema = coder.schema_git_operation, handler = coder.git_operation ),
|
||||||
|
gadget.tools_mapping( schema = rag.schema_ingest_files, handler = rag.ingest_files ),
|
||||||
gadget.tools_mapping( schema = rag.schema_store_knowledge, handler = rag.store_knowledge ),
|
gadget.tools_mapping( schema = rag.schema_store_knowledge, handler = rag.store_knowledge ),
|
||||||
gadget.tools_mapping( schema = rag.schema_search_knowledge, handler = rag.search_knowledge ),
|
gadget.tools_mapping( schema = rag.schema_search_knowledge, handler = rag.search_knowledge ),
|
||||||
gadget.tools_mapping( schema = rag.schema_create_collection, handler = rag.create_collection ),
|
gadget.tools_mapping( schema = rag.schema_create_collection, handler = rag.create_collection ),
|
||||||
|
|||||||
@ -1,2 +1,3 @@
|
|||||||
python-dotenv>=1.0.0
|
python-dotenv>=1.0.0
|
||||||
chromadb>=0.5.0
|
chromadb>=0.5.0
|
||||||
|
openpyxl>=3.1.0
|
||||||
|
|||||||
@ -40,9 +40,10 @@ def build_system_prompt(tools_definition):
|
|||||||
"- inspect_collection → learn metadata fields before searching.",
|
"- inspect_collection → learn metadata fields before searching.",
|
||||||
"- search_knowledge → semantic search + optional metadata filter.",
|
"- search_knowledge → semantic search + optional metadata filter.",
|
||||||
"- store_knowledge → save docs with rich metadata for later use.",
|
"- store_knowledge → save docs with rich metadata for later use.",
|
||||||
|
"- ingest_files → read files (with glob patterns) into a collection, auto-chunking.",
|
||||||
"",
|
"",
|
||||||
"You can create collections yourself! When you encounter a new topic,",
|
"You can create collections yourself! When you encounter a new topic,",
|
||||||
"use create_collection first, then store_knowledge to populate it.",
|
"use create_collection first, then store_knowledge or ingest_files to populate it.",
|
||||||
"Always inspect_collection to discover metadata keys before filtering."
|
"Always inspect_collection to discover metadata keys before filtering."
|
||||||
])
|
])
|
||||||
return "\n".join(lines)
|
return "\n".join(lines)
|
||||||
|
|||||||
188
tools/rag.py
188
tools/rag.py
@ -1,4 +1,7 @@
|
|||||||
|
import glob as globmod
|
||||||
import json
|
import json
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
|
||||||
import chromadb
|
import chromadb
|
||||||
from chromadb.config import Settings
|
from chromadb.config import Settings
|
||||||
@ -183,6 +186,50 @@ schema_inspect_collection = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# Tool schema advertised to the model for the `ingest_files` handler.
# Parameter names, types and defaults mirror the handler's signature; the
# descriptions spell out what the handler actually does (Excel support, how
# overlap and recursion are interpreted) so the model can call it correctly.
schema_ingest_files = {
    "type": "function",
    "function": {
        "name": "ingest_files",
        "description": (
            "Read one or more files (supports glob patterns like *.py or src/**/*.md) "
            "and store their content into a RAG collection. "
            "Excel workbooks (.xlsx, .xlsm) are stored as one document per sheet, "
            "with each row as a tab-separated line. "
            "Optionally chunk files into smaller pieces by line count. "
            "Automatically extracts metadata: filename, path, extension, size, modification time."
        ),
        "parameters": {
            "type": "object",
            "properties": {
                "collection": {
                    "type": "string",
                    "description": "Target collection name (will be created if it doesn't exist)"
                },
                "paths": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "File paths or glob patterns (e.g., ['*.txt', 'src/**/*.py'])"
                },
                "chunk_size": {
                    "type": "integer",
                    "description": "Lines per chunk (0 = whole file as one document)",
                    "default": 0
                },
                "chunk_overlap": {
                    "type": "integer",
                    "description": (
                        "Line overlap between chunks (only used when chunk_size > 0; "
                        "should be smaller than chunk_size)"
                    ),
                    "default": 0
                },
                "recursive": {
                    "type": "boolean",
                    "description": "Let '**' in glob patterns match nested directories recursively",
                    "default": True
                }
            },
            "required": ["collection", "paths"]
        }
    }
}
|
||||||
|
|
||||||
|
|
||||||
# ── Tool handlers ────────────────────────────────────────────────────
|
# ── Tool handlers ────────────────────────────────────────────────────
|
||||||
|
|
||||||
def _sanitize_meta(meta):
|
def _sanitize_meta(meta):
|
||||||
@ -294,3 +341,144 @@ def inspect_collection(collection, sample_size=3):
|
|||||||
return "\n".join(out)
|
return "\n".join(out)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return f"Error: {e}"
|
return f"Error: {e}"
|
||||||
|
|
||||||
|
|
||||||
|
def _expand_ingest_paths(paths, recursive):
    """Resolve literal paths and glob patterns to a sorted list of unique files.

    Returns ``(files, unmatched)`` where *files* are absolute paths of regular
    files and *unmatched* is the number of entries in *paths* that matched
    nothing at all.
    """
    found = set()
    unmatched = 0
    for pattern in paths:
        matches = globmod.glob(pattern, recursive=recursive)
        if not matches and os.path.isfile(pattern):
            # A literal path containing glob metacharacters (e.g. "[") may
            # fail to match itself as a pattern; accept it as-is.
            matches = [pattern]
        if not matches:
            unmatched += 1
            continue
        # Absolute paths collapse "./a.py" and "a.py" into one entry, which
        # keeps document IDs unique inside a single batch. Directories that a
        # pattern happens to match are dropped: they are not skipped *files*.
        found.update(os.path.abspath(m) for m in matches if os.path.isfile(m))
    return sorted(found), unmatched


def _read_ingest_sections(fpath, ext):
    """Read one file as a list of ``(sheet_name, lines, joiner)`` sections.

    Plain-text files yield a single section with ``sheet_name=None`` and lines
    that keep their own newlines (``joiner=""``). Excel workbooks yield one
    section per sheet, each row rendered as a tab-separated line
    (``joiner="\\n"``). Returns ``None`` when the file cannot be read.
    """
    if ext in (".xlsx", ".xlsm"):
        try:
            import openpyxl  # optional dependency; only needed for workbooks
        except ImportError:
            return None

        workbook = None
        try:
            workbook = openpyxl.load_workbook(fpath, read_only=True, data_only=True)
            sections = []
            for sheet_name in workbook.sheetnames:
                rows = []
                for row in workbook[sheet_name].iter_rows(values_only=True):
                    rows.append("\t".join("" if c is None else str(c) for c in row))
                sections.append((sheet_name, rows, "\n"))
            return sections
        except Exception:
            # Best-effort ingest: a corrupt or unsupported workbook skips this
            # file instead of aborting the whole run.
            return None
        finally:
            # Read-only workbooks keep the file handle open until closed.
            if workbook is not None:
                workbook.close()

    try:
        with open(fpath, "r", encoding="utf-8", errors="replace") as handle:
            return [(None, handle.readlines(), "")]
    except OSError:
        return None


def _chunk_lines(lines, chunk_size, chunk_overlap, joiner):
    """Yield ``(index, start, end, text)`` for successive windows over *lines*."""
    # A negative overlap would leave gaps between chunks, and an overlap that
    # is >= chunk_size would never advance; clamp so the stride is 1..chunk_size.
    step = max(1, chunk_size - max(0, chunk_overlap))
    total = len(lines)
    start = 0
    index = 0
    while start < total:
        end = min(start + chunk_size, total)
        yield index, start, end, joiner.join(lines[start:end])
        if end == total:
            # Any further window would lie entirely inside this one.
            break
        index += 1
        start += step


def ingest_files(collection, paths, chunk_size=0, chunk_overlap=0, recursive=True):
    """Read files into a RAG collection, optionally split into line-based chunks.

    Args:
        collection: Name of the target collection.
        paths: File paths or glob patterns; a single string is accepted too.
        chunk_size: Lines per chunk; 0 stores each file (or sheet) whole.
        chunk_overlap: Lines shared between consecutive chunks.
        recursive: Passed to ``glob`` so ``**`` matches nested directories.

    Returns:
        A human-readable summary string, ``"No matching files found."`` when
        nothing matched, or ``"Error: ..."`` if ingestion failed.

    Document IDs are derived from the file's relative path (plus sheet name and
    chunk index where applicable), so same-named files in different directories
    do not collide, and re-ingesting a file replaces its earlier documents.
    """
    try:
        if isinstance(paths, str):
            # Callers sometimes pass a bare string; iterating it would treat
            # every character as a separate pattern.
            paths = [paths]
        chunk_size = int(chunk_size or 0)
        chunk_overlap = int(chunk_overlap or 0)

        files, skipped = _expand_ingest_paths(paths, recursive)
        if not files:
            return "No matching files found."

        # Resolved only once there is something to ingest.
        col = _collection(collection)
        all_ids, all_texts, all_metas = [], [], []
        processed = 0

        for fpath in files:
            ext = os.path.splitext(fpath)[1].lower()
            try:
                stat = os.stat(fpath)
                rel_path = os.path.relpath(fpath)
            except (OSError, ValueError):
                # File vanished, or (on Windows) lives on another drive.
                skipped += 1
                continue

            sections = _read_ingest_sections(fpath, ext)
            if sections is None:
                skipped += 1
                continue

            base_meta = {
                "filename": os.path.basename(fpath),
                "path": rel_path,
                "extension": ext,
                "size": stat.st_size,
                "mtime": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(stat.st_mtime)),
            }
            # Forward slashes keep IDs identical across platforms.
            file_id = rel_path.replace(os.sep, "/")
            docs_before = len(all_ids)

            for sheet_name, lines, joiner in sections:
                if not joiner.join(lines).strip():
                    continue  # nothing but whitespace in this file/sheet

                section_meta = dict(base_meta)
                section_id = file_id
                if sheet_name is not None:
                    section_meta["sheet"] = sheet_name
                    section_id = f"{file_id}::{sheet_name}"

                if chunk_size > 0:
                    for cid, start, end, text in _chunk_lines(
                        lines, chunk_size, chunk_overlap, joiner
                    ):
                        meta = dict(section_meta)
                        meta["chunk_index"] = cid
                        meta["chunk_lines"] = end - start
                        meta["chunk_start_line"] = start + 1
                        all_ids.append(f"{section_id}::chunk_{cid}")
                        all_texts.append(text)
                        all_metas.append(_sanitize_meta(meta))
                else:
                    all_ids.append(section_id)
                    all_texts.append(joiner.join(lines))
                    all_metas.append(_sanitize_meta(section_meta))

            # Count per file, not per sheet or chunk, so the summary's
            # "file(s)" figure is accurate.
            if len(all_ids) > docs_before:
                processed += 1
            else:
                skipped += 1

        if all_ids:
            # upsert rather than add: re-ingesting a changed file refreshes
            # its stored documents.
            col.upsert(ids=all_ids, documents=all_texts, metadatas=all_metas)

        parts = [f"Ingested {processed} file(s) into '{collection}'"]
        if processed > 0:
            parts.append(f"({len(all_ids)} document(s) total)")
        if skipped > 0:
            parts.append(f"({skipped} file(s) skipped)")
        return " ".join(parts)

    except Exception as e:
        return f"Error: {e}"
|
||||||
|
|||||||
@ -7,10 +7,10 @@ WELCOME_ART = """\
|
|||||||
\n\
|
\n\
|
||||||
╔══════════════════════════════════════════╗
|
╔══════════════════════════════════════════╗
|
||||||
║ ║
|
║ ║
|
||||||
║ /\\_/\\ H E N D R I K ║
|
║ /\\_/\\ ║
|
||||||
║ ( o.o ) AI Agent ║
|
║ ( o.o ) HENDRIK ║
|
||||||
║ > ^ < siap membantu! ║
|
║ > ^ < ║
|
||||||
║ ( ) ║
|
║ ( ) AI Agent ║
|
||||||
║ (___) ║
|
║ (___) ║
|
||||||
║ ║
|
║ ║
|
||||||
╚══════════════════════════════════════════╝"""
|
╚══════════════════════════════════════════╝"""
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user