Unfinished chunking features

2026-06-08 09:27:38 +07:00 · 2026-06-08 09:27:38 +07:00 · 6e88d051bc
commit 6e88d051bc
parent d833774809
5 changed files with 196 additions and 5 deletions
--- a/hendrik.py
+++ b/hendrik.py
@ -14,6 +14,7 @@ tools_definition = [
    gadget.tools_mapping( schema = coder.schema_run_bash,         handler = coder.run_bash         ),
    gadget.tools_mapping( schema = coder.schema_search_code,      handler = coder.search_code      ),
    gadget.tools_mapping( schema = coder.schema_git_operation,    handler = coder.git_operation    ),
+    gadget.tools_mapping( schema = rag.schema_ingest_files,       handler = rag.ingest_files       ),
    gadget.tools_mapping( schema = rag.schema_store_knowledge,    handler = rag.store_knowledge    ),
    gadget.tools_mapping( schema = rag.schema_search_knowledge,   handler = rag.search_knowledge   ),
    gadget.tools_mapping( schema = rag.schema_create_collection,    handler = rag.create_collection    ),
--- a/requirements.txt
+++ b/requirements.txt
@ -1,2 +1,3 @@
 python-dotenv>=1.0.0
 chromadb>=0.5.0
+openpyxl>=3.1.0
--- a/scripts/gadget.py
+++ b/scripts/gadget.py
@ -40,9 +40,10 @@ def build_system_prompt(tools_definition):
        "- inspect_collection     → learn metadata fields before searching.",
        "- search_knowledge       → semantic search + optional metadata filter.",
        "- store_knowledge        → save docs with rich metadata for later use.",
+        "- ingest_files           → read files (with glob patterns) into a collection, auto-chunking.",
        "",
        "You can create collections yourself! When you encounter a new topic,",
-        "use create_collection first, then store_knowledge to populate it.",
+        "use create_collection first, then store_knowledge or ingest_files to populate it.",
        "Always inspect_collection to discover metadata keys before filtering."
    ])
    return "\n".join(lines)
--- a/tools/rag.py
+++ b/tools/rag.py
@ -1,4 +1,7 @@
+import glob as globmod
 import json
+import os
+import time

 import chromadb
 from chromadb.config import Settings
@ -183,6 +186,50 @@ schema_inspect_collection = {
 }


+schema_ingest_files = {  # tool schema paired with the ingest_files handler in hendrik.py's tools_definition
+    "type": "function",
+    "function": {
+        "name": "ingest_files",
+        "description": (
+            "Read one or more files (supports glob patterns like *.py or src/**/*.md) "
+            "and store their content into a RAG collection. "
+            "Optionally chunk files into smaller pieces by line count. "
+            "Automatically extracts metadata: filename, path, extension, size, modification time."
+        ),
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "collection": {
+                    "type": "string",
+                    "description": "Target collection name (will be created if it doesn't exist)"  # NOTE(review): creation relies on _collection(), not shown here — confirm
+                },
+                "paths": {
+                    "type": "array",
+                    "items": {"type": "string"},
+                    "description": "File paths or glob patterns (e.g., ['*.txt', 'src/**/*.py'])"  # an entry that matches nothing as a glob is retried as a literal file path
+                },
+                "chunk_size": {
+                    "type": "integer",
+                    "description": "Lines per chunk (0 = whole file as one document)",
+                    "default": 0
+                },
+                "chunk_overlap": {
+                    "type": "integer",
+                    "description": "Line overlap between chunks (only used when chunk_size > 0)",  # handler advances by 1 line when overlap >= chunk_size
+                    "default": 0
+                },
+                "recursive": {
+                    "type": "boolean",
+                    "description": "Search directories recursively when using glob patterns",
+                    "default": True  # forwarded to glob.glob(recursive=...), which only changes how "**" is expanded
+                }
+            },
+            "required": ["collection", "paths"]  # the other parameters fall back to the handler's keyword defaults
+        }
+    }
+}
+
+
 # ── Tool handlers ────────────────────────────────────────────────────

 def _sanitize_meta(meta):
@ -294,3 +341,144 @@ def inspect_collection(collection, sample_size=3):
        return "\n".join(out)
    except Exception as e:
        return f"Error: {e}"
+
+
+def ingest_files(collection, paths, chunk_size=0, chunk_overlap=0, recursive=True):  # expand paths, read each file, optionally chunk by lines, add everything to the collection; returns a status string
+    try:  # any exception below is reported as an "Error: ..." return string instead of propagating
+        col = _collection(collection)  # _collection() is defined elsewhere in this module (not shown in this hunk)
+        all_ids, all_texts, all_metas = [], [], []  # parallel lists handed to col.add() in one batch at the end
+        processed, skipped = 0, 0  # skipped also counts path patterns that matched nothing, not only files
+
+        # Expand glob patterns into real file paths
+        file_set = set()  # set de-duplicates files matched by more than one pattern
+        for p in paths:
+            expanded = globmod.glob(p, recursive=recursive)  # recursive only changes how "**" in the pattern is expanded
+            if expanded:
+                file_set.update(expanded)
+            else:
+                # Maybe it's a literal path that doesn't look like a glob
+                if os.path.isfile(p):
+                    file_set.add(p)
+                else:
+                    skipped += 1
+
+        if not file_set:
+            return "No matching files found."  # NOTE(review): the skipped count is not reported on this early return
+
+        for fpath in sorted(file_set):  # sorted so documents are produced in a stable order
+            if not os.path.isfile(fpath):  # glob results can include directories
+                skipped += 1
+                continue
+
+            ext = os.path.splitext(fpath)[1].lower()
+            stat = os.stat(fpath)
+            base_meta = {
+                "filename": os.path.basename(fpath),
+                "path": os.path.relpath(fpath),  # relative to the current working directory
+                "extension": ext,
+                "size": stat.st_size,
+                "mtime": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(stat.st_mtime)),  # local time, no timezone stored
+            }
+            base_name = os.path.splitext(os.path.basename(fpath))[0]  # NOTE(review): document IDs are built from this stem only, so a/x.py, b/x.py and x.md yield colliding IDs
+
+            # ── read content ──────────────────────────────────────────
+            if ext in (".xlsx", ".xlsm"):
+                try:
+                    import openpyxl  # optional dependency: when missing, the workbook is skipped rather than failing the call
+                except ImportError:
+                    skipped += 1
+                    continue
+
+                wb = openpyxl.load_workbook(fpath, read_only=True, data_only=True)  # data_only: read stored cell values rather than formulas
+                for sheet_name in wb.sheetnames:  # each sheet becomes its own document (or set of chunks)
+                    ws = wb[sheet_name]
+                    rows = []
+                    for row in ws.iter_rows(values_only=True):
+                        vals = [str(c) if c is not None else "" for c in row]  # empty cells become ""
+                        rows.append("\t".join(vals))  # one tab-separated text line per spreadsheet row
+                    lines = rows
+                    content = "\n".join(lines)
+                    if not content.strip():  # blank sheets are dropped without being counted as skipped
+                        continue
+
+                    sheet_meta = dict(base_meta)  # copy so the sheet name does not leak into other sheets' metadata
+                    sheet_meta["sheet"] = sheet_name
+
+                    if chunk_size > 0:
+                        n_lines = len(lines)
+                        cid = 0
+                        start = 0
+                        while start < n_lines:  # NOTE(review): with overlap > 0 this keeps going after a chunk has reached the last line, emitting tail chunks already contained in the previous one
+                            end = min(start + chunk_size, n_lines)
+                            chunk_text = "\n".join(lines[start:end])
+                            doc_id = f"{base_name}_{sheet_name}_chunk_{cid}"
+                            meta = dict(sheet_meta)
+                            meta["chunk_index"] = cid
+                            meta["chunk_lines"] = end - start
+                            meta["chunk_start_line"] = start + 1  # 1-based row number
+                            all_ids.append(doc_id)
+                            all_texts.append(chunk_text)
+                            all_metas.append(_sanitize_meta(meta))
+                            cid += 1
+                            step = chunk_size - chunk_overlap
+                            start += step if step > 0 else 1  # overlap >= chunk_size would never advance, so move by one line instead
+                        processed += 1  # NOTE(review): incremented per sheet, yet the summary labels this count "file(s)"
+                    else:
+                        doc_id = f"{base_name}_{sheet_name}"
+                        all_ids.append(doc_id)
+                        all_texts.append(content)
+                        all_metas.append(_sanitize_meta(sheet_meta))
+                        processed += 1
+                wb.close()  # NOTE(review): not reached if reading a sheet raises; the outer except then aborts the whole call
+            else:
+                # Plain-text files
+                try:
+                    with open(fpath, "r", encoding="utf-8", errors="replace") as f:  # undecodable bytes are replaced instead of raising, so any non-xlsx file is read as text
+                        lines = f.readlines()  # lines keep their trailing newlines, hence "".join below
+                except Exception:  # unreadable file: count it as skipped and continue with the next one
+                    skipped += 1
+                    continue
+
+                content = "".join(lines)
+                if not content.strip():  # whitespace-only files count as skipped
+                    skipped += 1
+                    continue
+
+                if chunk_size > 0:
+                    n_lines = len(lines)
+                    cid = 0
+                    start = 0
+                    while start < n_lines:  # NOTE(review): same trailing-overlap behavior as the spreadsheet branch above
+                        end = min(start + chunk_size, n_lines)
+                        chunk_text = "".join(lines[start:end])
+                        doc_id = f"{base_name}_chunk_{cid}"
+                        meta = dict(base_meta)
+                        meta["chunk_index"] = cid
+                        meta["chunk_lines"] = end - start
+                        meta["chunk_start_line"] = start + 1  # 1-based line number
+                        all_ids.append(doc_id)
+                        all_texts.append(chunk_text)
+                        all_metas.append(_sanitize_meta(meta))
+                        cid += 1
+                        step = chunk_size - chunk_overlap
+                        start += step if step > 0 else 1  # always advance at least one line
+                    processed += 1
+                else:
+                    doc_id = base_name
+                    all_ids.append(doc_id)
+                    all_texts.append(content)
+                    all_metas.append(_sanitize_meta(base_meta))
+                    processed += 1
+
+        if all_ids:
+            col.add(ids=all_ids, documents=all_texts, metadatas=all_metas)  # single batched write; NOTE(review): confirm how add() treats IDs that already exist when re-ingesting
+
+        parts = [f"Ingested {processed} file(s) into '{collection}'"]
+        if processed > 0:
+            parts.append(f"({len(all_ids)} document(s) total)")
+        if skipped > 0:
+            parts.append(f"({skipped} file(s) skipped)")
+        return " ".join(parts)
+
+    except Exception as e:
+        return f"Error: {e}"
--- a/tui/agent.py
+++ b/tui/agent.py
@ -7,10 +7,10 @@ WELCOME_ART = """\
 \n\
 ╔══════════════════════════════════════════╗
 ║                                          ║
-║     /\\_/\\       H E N D R I K            ║
-║    ( o.o )      AI Agent                 ║
-║     > ^ <       siap membantu!           ║
-║    (     )                               ║
+║     /\\_/\\                                ║
+║    ( o.o )      HENDRIK                  ║
+║     > ^ <                                ║
+║    (     )      AI Agent                 ║
 ║     (___)                                ║
 ║                                          ║
 ╚══════════════════════════════════════════╝"""