diff --git a/hendrik.py b/hendrik.py index e6be11c..815fd8f 100644 --- a/hendrik.py +++ b/hendrik.py @@ -14,6 +14,7 @@ tools_definition = [ gadget.tools_mapping( schema = coder.schema_run_bash, handler = coder.run_bash ), gadget.tools_mapping( schema = coder.schema_search_code, handler = coder.search_code ), gadget.tools_mapping( schema = coder.schema_git_operation, handler = coder.git_operation ), + gadget.tools_mapping( schema = rag.schema_ingest_files, handler = rag.ingest_files ), gadget.tools_mapping( schema = rag.schema_store_knowledge, handler = rag.store_knowledge ), gadget.tools_mapping( schema = rag.schema_search_knowledge, handler = rag.search_knowledge ), gadget.tools_mapping( schema = rag.schema_create_collection, handler = rag.create_collection ), diff --git a/requirements.txt b/requirements.txt index 3c2f1aa..9aca377 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ python-dotenv>=1.0.0 chromadb>=0.5.0 +openpyxl>=3.1.0 diff --git a/scripts/gadget.py b/scripts/gadget.py index d9f1031..3f1522b 100644 --- a/scripts/gadget.py +++ b/scripts/gadget.py @@ -40,9 +40,10 @@ def build_system_prompt(tools_definition): "- inspect_collection → learn metadata fields before searching.", "- search_knowledge → semantic search + optional metadata filter.", "- store_knowledge → save docs with rich metadata for later use.", + "- ingest_files → read files (with glob patterns) into a collection, auto-chunking.", "", "You can create collections yourself! When you encounter a new topic,", - "use create_collection first, then store_knowledge to populate it.", + "use create_collection first, then store_knowledge or ingest_files to populate it.", "Always inspect_collection to discover metadata keys before filtering." 
import glob as globmod
import os
import time


schema_ingest_files = {
    "type": "function",
    "function": {
        "name": "ingest_files",
        "description": (
            "Read one or more files (supports glob patterns like *.py or src/**/*.md) "
            "and store their content into a RAG collection. "
            "Optionally chunk files into smaller pieces by line count. "
            "Automatically extracts metadata: filename, path, extension, size, modification time."
        ),
        "parameters": {
            "type": "object",
            "properties": {
                "collection": {
                    "type": "string",
                    "description": "Target collection name (will be created if it doesn't exist)"
                },
                "paths": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "File paths or glob patterns (e.g., ['*.txt', 'src/**/*.py'])"
                },
                "chunk_size": {
                    "type": "integer",
                    "description": "Lines per chunk (0 = whole file as one document)",
                    "default": 0
                },
                "chunk_overlap": {
                    "type": "integer",
                    "description": "Line overlap between chunks (only used when chunk_size > 0)",
                    "default": 0
                },
                "recursive": {
                    "type": "boolean",
                    # glob's `recursive` flag only changes how '**' is expanded;
                    # a plain '*.py' never descends into sub-directories.
                    "description": "Allow '**' in glob patterns to match nested directories",
                    "default": True
                }
            },
            "required": ["collection", "paths"]
        }
    }
}


# Upper bound on documents written per collection call. Chroma rejects
# oversized batches; the exact limit depends on the backend, so stay well
# below it -- TODO confirm against the deployed chromadb version.
_INGEST_BATCH_SIZE = 1000


def _expand_paths(paths, recursive):
    """Expand literal paths / glob patterns into a sorted, de-duplicated list.

    Returns ``(files, unmatched)`` where ``unmatched`` counts the patterns
    that matched nothing on disk.
    """
    # A tool caller may hand over a bare string; iterating it would treat
    # every character as its own pattern.
    if isinstance(paths, str):
        paths = [paths]

    found = set()
    unmatched = 0
    for pattern in paths:
        matches = globmod.glob(pattern, recursive=recursive)
        if not matches and os.path.isfile(pattern):
            # Literal file name containing glob metacharacters (e.g. "a[1].txt").
            matches = [pattern]
        if not matches:
            unmatched += 1
            continue
        # normpath collapses "./x.py" and "x.py" into a single entry so the
        # same file is not ingested twice.
        found.update(os.path.normpath(match) for match in matches)
    return sorted(found), unmatched


def _iter_chunks(total, chunk_size, chunk_overlap):
    """Yield ``(index, start, end)`` line windows covering ``range(total)``.

    The overlap is clamped to ``[0, chunk_size - 1]`` so the window always
    advances and never leaves gaps. Iteration stops as soon as a window
    reaches the last line, so no chunk is a pure subset of the previous one.
    """
    overlap = min(max(chunk_overlap, 0), chunk_size - 1)
    step = chunk_size - overlap
    index = 0
    start = 0
    while start < total:
        end = min(start + chunk_size, total)
        yield index, start, end
        if end >= total:
            break
        index += 1
        start += step


def _build_documents(doc_key, lines, joiner, meta, chunk_size, chunk_overlap):
    """Turn ``lines`` into ``(id, text, metadata)`` triples.

    With ``chunk_size <= 0`` the whole input becomes one document whose id is
    ``doc_key``; otherwise each chunk gets the id ``"<doc_key>#chunk_<n>"``
    plus ``chunk_index`` / ``chunk_lines`` / ``chunk_start_line`` metadata.
    """
    if chunk_size <= 0:
        return [(doc_key, joiner.join(lines), _sanitize_meta(meta))]

    docs = []
    for index, start, end in _iter_chunks(len(lines), chunk_size, chunk_overlap):
        chunk_meta = dict(meta)
        chunk_meta["chunk_index"] = index
        chunk_meta["chunk_lines"] = end - start
        chunk_meta["chunk_start_line"] = start + 1  # 1-based for humans
        docs.append((
            f"{doc_key}#chunk_{index}",
            joiner.join(lines[start:end]),
            _sanitize_meta(chunk_meta),
        ))
    return docs


def _read_workbook_sheets(fpath):
    """Return ``[(sheet_name, tab_separated_rows), ...]`` or ``None`` on failure.

    ``None`` covers both a missing ``openpyxl`` install and an unreadable
    workbook, so the caller can skip the file instead of aborting the run.
    """
    try:
        import openpyxl
    except ImportError:
        return None

    try:
        workbook = openpyxl.load_workbook(fpath, read_only=True, data_only=True)
    except Exception:
        return None

    try:
        sheets = []
        for sheet_name in workbook.sheetnames:
            rows = workbook[sheet_name].iter_rows(values_only=True)
            lines = [
                "\t".join("" if cell is None else str(cell) for cell in row)
                for row in rows
            ]
            sheets.append((sheet_name, lines))
        return sheets
    except Exception:
        return None
    finally:
        # Always release the workbook's file handle, even on a read error.
        workbook.close()


def _load_file_documents(fpath, chunk_size, chunk_overlap):
    """Read one file and return its ``(id, text, metadata)`` triples.

    Returns an empty list when the file is unreadable or has no content.
    """
    try:
        rel_path = os.path.relpath(fpath)
    except ValueError:
        # relpath fails on Windows when the file lives on another drive.
        rel_path = os.path.abspath(fpath)

    try:
        stat = os.stat(fpath)
    except OSError:
        return []

    ext = os.path.splitext(fpath)[1].lower()
    base_meta = {
        "filename": os.path.basename(fpath),
        "path": rel_path,
        "extension": ext,
        "size": stat.st_size,
        "mtime": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(stat.st_mtime)),
    }

    if ext in (".xlsx", ".xlsm"):
        sheets = _read_workbook_sheets(fpath)
        if sheets is None:
            return []
        docs = []
        for sheet_name, lines in sheets:
            if not any(line.strip() for line in lines):
                continue  # empty sheet
            sheet_meta = dict(base_meta)
            sheet_meta["sheet"] = sheet_name
            docs.extend(_build_documents(
                f"{rel_path}#{sheet_name}", lines, "\n",
                sheet_meta, chunk_size, chunk_overlap,
            ))
        return docs

    # Plain-text files; undecodable bytes are replaced rather than fatal.
    try:
        with open(fpath, "r", encoding="utf-8", errors="replace") as handle:
            lines = handle.readlines()
    except OSError:
        return []
    if not any(line.strip() for line in lines):
        return []
    # readlines() keeps the line endings, hence the empty joiner.
    return _build_documents(rel_path, lines, "", base_meta, chunk_size, chunk_overlap)


def ingest_files(collection, paths, chunk_size=0, chunk_overlap=0, recursive=True):
    """Read files into a RAG collection, optionally chunked by line count.

    Args:
        collection: Name of the target collection (resolved via ``_collection``).
        paths: List of file paths / glob patterns; a single string is accepted.
        chunk_size: Lines per chunk; ``0`` (or less) stores each file -- or each
            workbook sheet -- as one document.
        chunk_overlap: Lines shared between consecutive chunks; clamped to
            ``[0, chunk_size - 1]``.
        recursive: Passed to ``glob.glob``; lets ``**`` match nested directories.

    Returns:
        A human-readable summary string, ``"No matching files found."`` when
        nothing matched, or ``"Error: ..."`` if an exception occurred.

    Document ids are derived from the file's relative path (plus sheet name and
    chunk index where applicable), so files sharing a base name do not collide.
    Documents are written with ``upsert`` so re-ingesting a file refreshes the
    stored copy instead of being rejected or ignored as a duplicate id.
    """
    try:
        col = _collection(collection)
        chunk_size = max(0, int(chunk_size or 0))
        chunk_overlap = max(0, int(chunk_overlap or 0))

        files, skipped = _expand_paths(paths, recursive)
        if not files:
            return "No matching files found."

        documents = []
        processed = 0
        for fpath in files:
            if not os.path.isfile(fpath):
                skipped += 1  # directories matched by a glob, vanished files
                continue
            file_docs = _load_file_documents(fpath, chunk_size, chunk_overlap)
            if file_docs:
                documents.extend(file_docs)
                processed += 1  # counted per file, not per sheet/chunk
            else:
                skipped += 1

        for offset in range(0, len(documents), _INGEST_BATCH_SIZE):
            batch = documents[offset:offset + _INGEST_BATCH_SIZE]
            col.upsert(
                ids=[doc_id for doc_id, _, _ in batch],
                documents=[text for _, text, _ in batch],
                metadatas=[meta for _, _, meta in batch],
            )

        parts = [f"Ingested {processed} file(s) into '{collection}'"]
        if processed > 0:
            parts.append(f"({len(documents)} document(s) total)")
        if skipped > 0:
            parts.append(f"({skipped} file(s) skipped)")
        return " ".join(parts)

    except Exception as e:
        return f"Error: {e}"