Unfinished chunking features

2026-06-08 09:27:38 +07:00 · 2026-06-08 09:27:38 +07:00 · 6e88d051bc
commit 6e88d051bc
parent d833774809
5 changed files with 196 additions and 5 deletions
--- a/hendrik.py
+++ b/hendrik.py
@ -14,6 +14,7 @@ tools_definition = [
    gadget.tools_mapping( schema = coder.schema_run_bash,         handler = coder.run_bash         ),
    gadget.tools_mapping( schema = coder.schema_search_code,      handler = coder.search_code      ),
    gadget.tools_mapping( schema = coder.schema_git_operation,    handler = coder.git_operation    ),
    gadget.tools_mapping( schema = rag.schema_ingest_files,       handler = rag.ingest_files       ),
    gadget.tools_mapping( schema = rag.schema_store_knowledge,    handler = rag.store_knowledge    ),
    gadget.tools_mapping( schema = rag.schema_search_knowledge,   handler = rag.search_knowledge   ),
    gadget.tools_mapping( schema = rag.schema_create_collection,    handler = rag.create_collection    ),
--- a/requirements.txt
+++ b/requirements.txt
@ -1,2 +1,3 @@
 python-dotenv>=1.0.0
 chromadb>=0.5.0
 openpyxl>=3.1.0
--- a/scripts/gadget.py
+++ b/scripts/gadget.py
@ -40,9 +40,10 @@ def build_system_prompt(tools_definition):
        "- inspect_collection     → learn metadata fields before searching.",
        "- search_knowledge       → semantic search + optional metadata filter.",
        "- store_knowledge        → save docs with rich metadata for later use.",
        "- ingest_files           → read files (with glob patterns) into a collection, auto-chunking.",
        "",
        "You can create collections yourself! When you encounter a new topic,",
-        "use create_collection first, then store_knowledge to populate it.",
+        "use create_collection first, then store_knowledge or ingest_files to populate it.",
        "Always inspect_collection to discover metadata keys before filtering."
    ])
    return "\n".join(lines)
--- a/tools/rag.py
+++ b/tools/rag.py
@ -1,4 +1,7 @@
 import glob as globmod
 import json
 import os
 import time
 import chromadb
 from chromadb.config import Settings
@ -183,6 +186,50 @@ schema_inspect_collection = {
 }
 # Tool schema advertised to the model for the `ingest_files` handler.
 # Only `collection` and `paths` are mandatory; the remaining parameters
 # carry defaults that mirror the handler's keyword defaults.
 schema_ingest_files = {
     "type": "function",
     "function": {
         "name": "ingest_files",
         # Sentences are joined with a single space to form one description.
         "description": " ".join([
             "Read one or more files (supports glob patterns like *.py or src/**/*.md)"
             " and store their content into a RAG collection.",
             "Optionally chunk files into smaller pieces by line count.",
             "Automatically extracts metadata: filename, path, extension, size, modification time.",
         ]),
         "parameters": {
             "type": "object",
             "properties": {
                 # Destination collection.
                 "collection": dict(
                     type="string",
                     description="Target collection name (will be created if it doesn't exist)",
                 ),
                 # Literal paths and/or glob patterns.
                 "paths": dict(
                     type="array",
                     items=dict(type="string"),
                     description="File paths or glob patterns (e.g., ['*.txt', 'src/**/*.py'])",
                 ),
                 # Line-based chunking controls.
                 "chunk_size": dict(
                     type="integer",
                     description="Lines per chunk (0 = whole file as one document)",
                     default=0,
                 ),
                 "chunk_overlap": dict(
                     type="integer",
                     description="Line overlap between chunks (only used when chunk_size > 0)",
                     default=0,
                 ),
                 # Forwarded to the glob expansion.
                 "recursive": dict(
                     type="boolean",
                     description="Search directories recursively when using glob patterns",
                     default=True,
                 ),
             },
             "required": ["collection", "paths"],
         },
     },
 }
 # ── Tool handlers ────────────────────────────────────────────────────
 def _sanitize_meta(meta):
@ -294,3 +341,144 @@ def inspect_collection(collection, sample_size=3):
        return "\n".join(out)
    except Exception as e:
        return f"Error: {e}"
 def _ingest_expand_paths(paths, recursive):
     """Expand literal paths and glob patterns into a sorted list of matches.

     Returns a ``(matches, unmatched)`` tuple where ``unmatched`` counts the
     entries of ``paths`` that matched nothing and are not an existing file.
     """
     # A bare string would otherwise be iterated character by character.
     if isinstance(paths, str):
         paths = [paths]
     found = set()
     unmatched = 0
     for pattern in paths or ():
         matches = globmod.glob(pattern, recursive=recursive)
         if matches:
             found.update(matches)
         elif os.path.isfile(pattern):
             # Literal filename containing glob metacharacters (e.g. "a[1].txt").
             found.add(pattern)
         else:
             unmatched += 1
     return sorted(found), unmatched


 def _ingest_chunk_lines(lines, chunk_size, chunk_overlap):
     """Yield ``(chunk_index, start_offset, chunk_lines)`` windows over ``lines``.

     ``chunk_size`` must be positive. The overlap is clamped to
     ``0 .. chunk_size - 1`` so the window always advances by at least one line
     and never skips lines.
     """
     step = max(1, chunk_size - max(0, chunk_overlap))
     total = len(lines)
     start = 0
     index = 0
     while start < total:
         end = min(start + chunk_size, total)
         yield index, start, lines[start:end]
         if end >= total:
             # The window already reached the last line; any further window
             # would only repeat lines contained in this chunk.
             break
         index += 1
         start += step


 def _ingest_make_docs(id_base, lines, joiner, meta, chunk_size, chunk_overlap):
     """Build ``(doc_id, text, metadata)`` tuples for one file or one sheet."""
     if chunk_size <= 0:
         return [(id_base, joiner.join(lines), dict(meta))]
     docs = []
     for index, start, piece in _ingest_chunk_lines(lines, chunk_size, chunk_overlap):
         chunk_meta = dict(meta)
         chunk_meta["chunk_index"] = index
         chunk_meta["chunk_lines"] = len(piece)
         chunk_meta["chunk_start_line"] = start + 1  # 1-based line number
         docs.append((f"{id_base}_chunk_{index}", joiner.join(piece), chunk_meta))
     return docs


 def _ingest_read_sheets(fpath):
     """Read a workbook and return ``[(sheet_name, rows)]``.

     Each row is rendered as tab-separated cell text, with empty cells as "".
     Raises ImportError when openpyxl is not installed; other errors raised by
     openpyxl while opening or reading the file propagate to the caller.
     """
     import openpyxl

     wb = openpyxl.load_workbook(fpath, read_only=True, data_only=True)
     try:
         sheets = []
         for sheet_name in wb.sheetnames:
             rows = [
                 "\t".join("" if cell is None else str(cell) for cell in row)
                 for row in wb[sheet_name].iter_rows(values_only=True)
             ]
             sheets.append((sheet_name, rows))
         return sheets
     finally:
         # Always release the workbook, even if reading fails midway.
         wb.close()


 def ingest_files(collection, paths, chunk_size=0, chunk_overlap=0, recursive=True):
     """Read files (literal paths or glob patterns) and store them in a collection.

     Plain-text files are read as UTF-8 with undecodable bytes replaced.
     ``.xlsx`` / ``.xlsm`` workbooks produce one document per non-empty sheet,
     with rows rendered as tab-separated text. When ``chunk_size > 0`` each
     file/sheet is split into windows of that many lines, overlapping by
     ``chunk_overlap`` lines.

     Document IDs are derived from the file's relative path (not just its
     stem), so files that share a name in different directories, or share a
     stem with different extensions, do not produce clashing IDs.

     Args:
         collection: Target collection name.
         paths: A list of paths/glob patterns; a single string is also accepted.
         chunk_size: Lines per chunk; 0 stores each file/sheet as one document.
         chunk_overlap: Lines shared between consecutive chunks.
         recursive: Passed to ``glob.glob`` (controls ``**`` expansion).

     Returns:
         A human-readable summary string, ``"No matching files found."`` when
         nothing matched, or ``"Error: ..."`` if an unexpected exception occurred.
     """
     try:
         # Tool arguments may arrive as None or numeric strings.
         chunk_size = int(chunk_size or 0)
         chunk_overlap = int(chunk_overlap or 0)

         files, skipped = _ingest_expand_paths(paths, recursive)
         if not files:
             return "No matching files found."

         # Only open the collection once there is something to ingest, so a
         # mistyped pattern does not touch the store.
         col = _collection(collection)

         all_ids, all_texts, all_metas = [], [], []
         processed = 0

         for fpath in files:
             # Globs can match directories; those are counted as skipped.
             if not os.path.isfile(fpath):
                 skipped += 1
                 continue
             try:
                 stat = os.stat(fpath)
                 rel_path = os.path.relpath(fpath)
             except (OSError, ValueError):
                 # File vanished, or relpath is impossible (e.g. another drive).
                 skipped += 1
                 continue

             ext = os.path.splitext(fpath)[1].lower()
             base_meta = {
                 "filename": os.path.basename(fpath),
                 "path": rel_path,
                 "extension": ext,
                 "size": stat.st_size,
                 "mtime": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(stat.st_mtime)),
             }
             # Path-based ID, normalised to forward slashes for stable IDs.
             id_base = rel_path.replace(os.sep, "/")

             docs = []
             if ext in (".xlsx", ".xlsm"):
                 try:
                     sheets = _ingest_read_sheets(fpath)
                 except Exception:
                     # Best effort: a missing openpyxl or a corrupt workbook
                     # skips this file instead of aborting the whole ingest.
                     skipped += 1
                     continue
                 for sheet_name, rows in sheets:
                     if not any(row.strip() for row in rows):
                         continue  # sheet has no visible content
                     sheet_meta = dict(base_meta, sheet=sheet_name)
                     docs.extend(_ingest_make_docs(
                         f"{id_base}_{sheet_name}", rows, "\n",
                         sheet_meta, chunk_size, chunk_overlap,
                     ))
             else:
                 try:
                     with open(fpath, "r", encoding="utf-8", errors="replace") as f:
                         lines = f.readlines()
                 except OSError:
                     skipped += 1
                     continue
                 if "".join(lines).strip():
                     # Lines keep their own newline, hence the empty joiner.
                     docs.extend(_ingest_make_docs(
                         id_base, lines, "", base_meta, chunk_size, chunk_overlap,
                     ))

             if not docs:
                 # Empty / whitespace-only file or workbook.
                 skipped += 1
                 continue

             for doc_id, text, meta in docs:
                 all_ids.append(doc_id)
                 all_texts.append(text)
                 all_metas.append(_sanitize_meta(meta))
             # Count files, not sheets or chunks.
             processed += 1

         if all_ids:
             col.add(ids=all_ids, documents=all_texts, metadatas=all_metas)

         parts = [f"Ingested {processed} file(s) into '{collection}'"]
         if processed > 0:
             parts.append(f"({len(all_ids)} document(s) total)")
         if skipped > 0:
             parts.append(f"({skipped} file(s) skipped)")
         return " ".join(parts)
     except Exception as e:
         return f"Error: {e}"
--- a/tui/agent.py
+++ b/tui/agent.py
@ -7,10 +7,10 @@ WELCOME_ART = """\
 \n\
 ╔══════════════════════════════════════════╗
 ║                                          ║
-║     /\\_/\\       H E N D R I K            ║
+║     /\\_/\\                                ║
-║    ( o.o )      AI Agent                 ║
+║    ( o.o )      HENDRIK                  ║
-║     > ^ <       siap membantu!           ║
+║     > ^ <                                ║
-║    (     )                               ║
+║    (     )      AI Agent                 ║
 ║     (___)                                ║
 ║                                          ║
 ╚══════════════════════════════════════════╝"""