Improving RAG
This commit is contained in:
parent
059445de5e
commit
03221ca119
@ -122,7 +122,8 @@ SESSION_DB_PATH = os.path.expanduser(
|
|||||||
|
|
||||||
# ─── RAG (YAML) ─────────────────────────────────────────────────────────────────
|
# ─── RAG (YAML) ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
RAG_PERSIST_DIR = os.getenv("RAG_PERSIST_DIR", default=_yaml_get("rag", "persist_dir", default="chroma_db"))
|
RAG_PERSIST_DIR = os.getenv("RAG_PERSIST_DIR", default=_yaml_get("rag", "persist_dir", default="lancedb_data"))
|
||||||
|
RAG_MODEL_PATH = os.getenv("RAG_MODEL_PATH", default=_yaml_get("rag", "model_path", default=""))
|
||||||
|
|
||||||
|
|
||||||
# ─── Humanize Delay (YAML) ─────────────────────────────────────────────────────
|
# ─── Humanize Delay (YAML) ─────────────────────────────────────────────────────
|
||||||
|
|||||||
@ -34,7 +34,8 @@ llm:
|
|||||||
- name : "z-ai/glm-5"
|
- name : "z-ai/glm-5"
|
||||||
|
|
||||||
rag:
|
rag:
|
||||||
persist_dir: chroma_db # ChromaDB ONNX default (all-MiniLM-L6-v2, local)
|
persist_dir: "~/.config/hendrik/rag" # LanceDB Vector Store (all-MiniLM-L6-v2, local)
|
||||||
|
model_path: "~/.config/hendrik/models" # Custom path to store/load embedding model.
|
||||||
|
|
||||||
session:
|
session:
|
||||||
db_path: "~/.config/hendrik/sessions.json"
|
db_path: "~/.config/hendrik/sessions.json"
|
||||||
@ -50,8 +51,7 @@ telegram:
|
|||||||
allowed_group_ids: "" # comma-separated, empty = all group
|
allowed_group_ids: "" # comma-separated, empty = all group
|
||||||
selective_response: true # true = only response if mentioned/relevant
|
selective_response: true # true = only response if mentioned/relevant
|
||||||
|
|
||||||
# Humanize Delay (anti-bot detection)
|
delay: # Humanize Delay (anti-bot detection)
|
||||||
delay:
|
|
||||||
read_min: 1.0 # second
|
read_min: 1.0 # second
|
||||||
read_max: 2.0 # second
|
read_max: 2.0 # second
|
||||||
typing_speed: 15.0 # characters per second
|
typing_speed: 15.0 # characters per second
|
||||||
|
|||||||
@ -5,3 +5,7 @@ openpyxl>=3.1.0
|
|||||||
slixmpp
|
slixmpp
|
||||||
python-telegram-bot>=20.0
|
python-telegram-bot>=20.0
|
||||||
tinydb>=4.8.0
|
tinydb>=4.8.0
|
||||||
|
lancedb
|
||||||
|
sentence-transformers
|
||||||
|
pandas
|
||||||
|
pylance
|
||||||
|
|||||||
262
tools/rag.py
262
tools/rag.py
@ -2,29 +2,78 @@ import glob as globmod
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
|
import pandas as pd
|
||||||
import chromadb
|
import lancedb
|
||||||
from chromadb.config import Settings
|
from lancedb.pydantic import LanceModel
|
||||||
|
from sentence_transformers import SentenceTransformer
|
||||||
import config
|
import config
|
||||||
|
|
||||||
|
# ── Embedding Setup ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
# ── ChromaDB singleton ───────────────────────────────────────────────
|
def load_embedding_model():
|
||||||
|
"""
|
||||||
|
Logika pemuatan model embedding berdasarkan konfigurasi:
|
||||||
|
1. Jika model_path kosong -> gunakan default cache (~/.cache/...)
|
||||||
|
2. Jika model_path diisi tapi folder belum ada -> download lalu simpan ke folder tersebut
|
||||||
|
3. Jika model_path diisi dan folder sudah ada -> load langsung dari folder tersebut
|
||||||
|
"""
|
||||||
|
model_name = "all-MiniLM-L6-v2"
|
||||||
|
custom_path = config.RAG_MODEL_PATH.strip()
|
||||||
|
|
||||||
_store = None
|
try:
|
||||||
|
if not custom_path:
|
||||||
|
# Kasus 1: Pakai default cache
|
||||||
|
print(f"[RAG] Loading embedding model '{model_name}' from default cache...")
|
||||||
|
return SentenceTransformer(model_name)
|
||||||
|
|
||||||
|
# Kasus 2 & 3: Menggunakan path kustom
|
||||||
|
if os.path.exists(custom_path):
|
||||||
|
print(f"[RAG] Loading embedding model from custom path: {custom_path}")
|
||||||
|
return SentenceTransformer(custom_path)
|
||||||
|
else:
|
||||||
|
print(f"[RAG] Custom path {custom_path} not found. Downloading model first...")
|
||||||
|
model = SentenceTransformer(model_name)
|
||||||
|
# Buat direktori jika belum ada
|
||||||
|
os.makedirs(custom_path, exist_ok=True)
|
||||||
|
model.save(custom_path)
|
||||||
|
print(f"[RAG] Model successfully downloaded and saved to: {custom_path}")
|
||||||
|
return model
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[RAG] Critical Error loading embedding model: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
def _get_store():
|
# Inisialisasi model saat startup
|
||||||
global _store
|
embedding_model = load_embedding_model()
|
||||||
if _store is None:
|
|
||||||
_store = chromadb.PersistentClient(
|
|
||||||
path=config.RAG_PERSIST_DIR,
|
|
||||||
settings=Settings(anonymized_telemetry=False),
|
|
||||||
)
|
|
||||||
return _store
|
|
||||||
|
|
||||||
def _collection(name):
|
def get_embedding(text):
|
||||||
"""Get or create collection — uses ChromaDB's default ONNX embedding (all-MiniLM-L6-v2)."""
|
"""Fungsi standar untuk menghasilkan embedding"""
|
||||||
return _get_store().get_or_create_collection(name=name)
|
if embedding_model is None:
|
||||||
|
raise Exception("Embedding model not loaded. Check your config or internet connection.")
|
||||||
|
return embedding_model.encode(text).tolist()
|
||||||
|
|
||||||
|
# Skema sederhana untuk menghindari konflik Pydantic
|
||||||
|
class DocumentSchema(LanceModel):
|
||||||
|
text: str
|
||||||
|
id: str
|
||||||
|
metadata: str
|
||||||
|
vector: list[float]
|
||||||
|
|
||||||
|
# ── LanceDB singleton ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
_db = None
|
||||||
|
|
||||||
|
def _get_db():
|
||||||
|
global _db
|
||||||
|
if _db is None:
|
||||||
|
_db = lancedb.connect(config.RAG_PERSIST_DIR)
|
||||||
|
return _db
|
||||||
|
|
||||||
|
def _get_table(name):
|
||||||
|
db = _get_db()
|
||||||
|
if name in db.table_names():
|
||||||
|
return db.open_table(name)
|
||||||
|
return db.create_table(name, schema=DocumentSchema)
|
||||||
|
|
||||||
# ── Tool schemas ─────────────────────────────────────────────────────
|
# ── Tool schemas ─────────────────────────────────────────────────────
|
||||||
|
|
||||||
@ -74,9 +123,8 @@ schema_search_knowledge = {
|
|||||||
"name": "search_knowledge",
|
"name": "search_knowledge",
|
||||||
"description": (
|
"description": (
|
||||||
"Semantically search a RAG collection. Optionally narrow with a "
|
"Semantically search a RAG collection. Optionally narrow with a "
|
||||||
"metadata filter using ChromaDB where syntax. "
|
"metadata filter using SQL-like syntax. "
|
||||||
"Examples: {'category': 'main_course'}, {'spice_level': {'$lte': 2}}, "
|
"Example: \"metadata LIKE '%main_course%'\""
|
||||||
"{'allergens': {'$contains': 'seafood'}}."
|
|
||||||
),
|
),
|
||||||
"parameters": {
|
"parameters": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
@ -95,8 +143,8 @@ schema_search_knowledge = {
|
|||||||
"default": 5
|
"default": 5
|
||||||
},
|
},
|
||||||
"filter": {
|
"filter": {
|
||||||
"type": "object",
|
"type": "string",
|
||||||
"description": "Optional metadata filter dict",
|
"description": "Optional SQL-like filter for metadata JSON string",
|
||||||
"default": None
|
"default": None
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@ -185,7 +233,6 @@ schema_inspect_collection = {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
schema_ingest_files = {
|
schema_ingest_files = {
|
||||||
"type": "function",
|
"type": "function",
|
||||||
"function": {
|
"function": {
|
||||||
@ -229,138 +276,134 @@ schema_ingest_files = {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# ── Tool handlers ────────────────────────────────────────────────────
|
# ── Tool handlers ────────────────────────────────────────────────────
|
||||||
|
|
||||||
def _sanitize_meta(meta):
|
|
||||||
"""ChromaDB metadata only allows str/int/float/bool. Convert lists to JSON string, remove empty lists."""
|
|
||||||
out = {}
|
|
||||||
for k, v in meta.items():
|
|
||||||
if isinstance(v, list):
|
|
||||||
if len(v) == 0:
|
|
||||||
continue
|
|
||||||
out[k] = json.dumps(v, ensure_ascii=False)
|
|
||||||
elif isinstance(v, (str, int, float, bool)):
|
|
||||||
out[k] = v
|
|
||||||
else:
|
|
||||||
out[k] = str(v)
|
|
||||||
return out
|
|
||||||
|
|
||||||
def store_knowledge(collection, documents):
|
def store_knowledge(collection, documents):
|
||||||
try:
|
try:
|
||||||
col = _collection(collection)
|
table = _get_table(collection)
|
||||||
ids, texts, metas = [], [], []
|
data = []
|
||||||
for doc in documents:
|
for doc in documents:
|
||||||
ids.append(doc["id"])
|
data.append({
|
||||||
texts.append(doc["text"])
|
"id": doc["id"],
|
||||||
metas.append(_sanitize_meta(doc.get("metadata", {})))
|
"text": doc["text"],
|
||||||
col.add(ids=ids, documents=texts, metadatas=metas)
|
"metadata": json.dumps(doc.get("metadata", {}), ensure_ascii=False),
|
||||||
|
"vector": get_embedding(doc["text"])
|
||||||
|
})
|
||||||
|
table.add(data)
|
||||||
return f"Stored {len(documents)} document(s) in '{collection}'."
|
return f"Stored {len(documents)} document(s) in '{collection}'."
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return f"Error: {e}"
|
return f"Error: {e}"
|
||||||
|
|
||||||
|
|
||||||
def search_knowledge(collection, query, n_results=5, filter=None):
|
def search_knowledge(collection, query, n_results=5, filter=None):
|
||||||
try:
|
try:
|
||||||
col = _collection(collection)
|
table = _get_table(collection)
|
||||||
kw = {"query_texts": [query], "n_results": n_results}
|
# LanceDB semantic search
|
||||||
|
query_vector = get_embedding(query)
|
||||||
|
res = table.search(query_vector).limit(n_results)
|
||||||
|
|
||||||
if filter:
|
if filter:
|
||||||
kw["where"] = filter
|
res = table.search(query_vector).where(filter).limit(n_results)
|
||||||
r = col.query(**kw)
|
|
||||||
if not r["ids"] or not r["ids"][0]:
|
df = res.to_pandas()
|
||||||
|
|
||||||
|
if df.empty:
|
||||||
return "No results found."
|
return "No results found."
|
||||||
|
|
||||||
out = []
|
out = []
|
||||||
for i in range(len(r["ids"][0])):
|
for _, row in df.iterrows():
|
||||||
did = r["ids"][0][i]
|
did = row["id"]
|
||||||
txt = r["documents"][0][i]
|
txt = row["text"]
|
||||||
if len(txt) > 500:
|
if len(txt) > 500:
|
||||||
txt = txt[:500] + "..."
|
txt = txt[:500] + "..."
|
||||||
meta = json.dumps(r["metadatas"][0][i], ensure_ascii=False) if r.get("metadatas") else "{}"
|
meta = row["metadata"]
|
||||||
dist = ""
|
out.append(f"[{did}]\n text: {txt}\n metadata: {meta}")
|
||||||
if r.get("distances"):
|
|
||||||
dist = f" (score: {r['distances'][0][i]:.4f})"
|
|
||||||
out.append(f"[{did}]{dist}\n text: {txt}\n metadata: {meta}")
|
|
||||||
return "\n---\n".join(out)
|
return "\n---\n".join(out)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return f"Error: {e}"
|
return f"Error: {e}"
|
||||||
|
|
||||||
|
|
||||||
def create_collection(name, description=""):
|
def create_collection(name, description=""):
|
||||||
try:
|
try:
|
||||||
col = _get_store().get_or_create_collection(name=name)
|
_get_table(name)
|
||||||
col.modify(metadata={"description": description})
|
|
||||||
return f"Collection '{name}' is ready."
|
return f"Collection '{name}' is ready."
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return f"Error: {e}"
|
return f"Error: {e}"
|
||||||
|
|
||||||
def delete_collection(name):
|
def delete_collection(name):
|
||||||
try:
|
try:
|
||||||
_get_store().delete_collection(name)
|
db = _get_db()
|
||||||
|
table_path = os.path.join(config.RAG_PERSIST_DIR, name)
|
||||||
|
if os.path.exists(table_path):
|
||||||
|
import shutil
|
||||||
|
shutil.rmtree(table_path)
|
||||||
return f"Deleted collection '{name}'."
|
return f"Deleted collection '{name}'."
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return f"Error: {e}"
|
return f"Error: {e}"
|
||||||
|
|
||||||
def list_collections():
|
def list_collections():
|
||||||
try:
|
try:
|
||||||
cols = _get_store().list_collections()
|
db = _get_db()
|
||||||
|
cols = db.table_names()
|
||||||
if not cols:
|
if not cols:
|
||||||
return "No collections exist yet."
|
return "No collections exist yet."
|
||||||
|
|
||||||
out = ["Available collections:"]
|
out = ["Available collections:"]
|
||||||
for col in cols:
|
for col in cols:
|
||||||
meta = col.metadata or {}
|
table = db.open_table(col)
|
||||||
desc = meta.get("description", "")
|
cnt = len(table.to_pandas())
|
||||||
cnt = col.count()
|
out.append(f"- {col} [{cnt} docs]")
|
||||||
tag = f" ({desc})" if desc else ""
|
|
||||||
out.append(f"- {col.name}{tag} [{cnt} docs]")
|
|
||||||
return "\n".join(out)
|
return "\n".join(out)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return f"Error: {e}"
|
return f"Error: {e}"
|
||||||
|
|
||||||
|
|
||||||
def inspect_collection(collection, sample_size=3):
|
def inspect_collection(collection, sample_size=3):
|
||||||
try:
|
try:
|
||||||
col = _collection(collection)
|
table = _get_table(collection)
|
||||||
cnt = col.count()
|
df = table.to_pandas()
|
||||||
|
cnt = len(df)
|
||||||
if cnt == 0:
|
if cnt == 0:
|
||||||
return f"Collection '{collection}' is empty."
|
return f"Collection '{collection}' is empty."
|
||||||
|
|
||||||
n = min(sample_size, cnt)
|
n = min(sample_size, cnt)
|
||||||
r = col.get(limit=n, include=["documents", "metadatas"])
|
sample = df.head(n)
|
||||||
|
|
||||||
out = [f"Collection: {collection} | Total documents: {cnt}", f"Sample ({n}):"]
|
out = [f"Collection: {collection} | Total documents: {cnt}", f"Sample ({n}):"]
|
||||||
for i in range(len(r["ids"])):
|
for _, row in sample.iterrows():
|
||||||
txt = r["documents"][i]
|
txt = row["text"]
|
||||||
if len(txt) > 200:
|
if len(txt) > 200:
|
||||||
txt = txt[:200] + "..."
|
txt = txt[:200] + "..."
|
||||||
meta = json.dumps(r["metadatas"][i], ensure_ascii=False) if r.get("metadatas") and r["metadatas"][i] else "(none)"
|
meta = row["metadata"]
|
||||||
out.append(f"\n [{r['ids'][i]}] text: {txt} metadata: {meta}")
|
out.append(f"\n [{row['id']}] text: {txt} metadata: {meta}")
|
||||||
|
|
||||||
keys = set()
|
keys = set()
|
||||||
for m in r["metadatas"]:
|
for m_str in sample["metadata"]:
|
||||||
if m:
|
try:
|
||||||
keys.update(m.keys())
|
m_dict = json.loads(m_str)
|
||||||
|
keys.update(m_dict.keys())
|
||||||
|
except:
|
||||||
|
pass
|
||||||
if keys:
|
if keys:
|
||||||
out.append(f"\nMetadata keys: {', '.join(sorted(keys))}")
|
out.append(f"\nMetadata keys: {', '.join(sorted(keys))}")
|
||||||
|
|
||||||
return "\n".join(out)
|
return "\n".join(out)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return f"Error: {e}"
|
return f"Error: {e}"
|
||||||
|
|
||||||
|
|
||||||
def ingest_files(collection, paths, chunk_size=0, chunk_overlap=0, recursive=True):
|
def ingest_files(collection, paths, chunk_size=0, chunk_overlap=0, recursive=True):
|
||||||
try:
|
try:
|
||||||
col = _collection(collection)
|
table = _get_table(collection)
|
||||||
all_ids, all_texts, all_metas = [], [], []
|
all_data = []
|
||||||
processed, skipped = 0, 0
|
processed, skipped = 0, 0
|
||||||
|
|
||||||
# Expand glob patterns into real file paths
|
|
||||||
file_set = set()
|
file_set = set()
|
||||||
for p in paths:
|
for p in paths:
|
||||||
expanded = globmod.glob(p, recursive=recursive)
|
expanded = globmod.glob(p, recursive=recursive)
|
||||||
if expanded:
|
if expanded:
|
||||||
file_set.update(expanded)
|
file_set.update(expanded)
|
||||||
|
elif os.path.isfile(p):
|
||||||
|
file_set.add(p)
|
||||||
else:
|
else:
|
||||||
# Maybe it's a literal path that doesn't look like a glob
|
skipped += 1
|
||||||
if os.path.isfile(p):
|
|
||||||
file_set.add(p)
|
|
||||||
else:
|
|
||||||
skipped += 1
|
|
||||||
|
|
||||||
if not file_set:
|
if not file_set:
|
||||||
return "No matching files found."
|
return "No matching files found."
|
||||||
@ -381,7 +424,6 @@ def ingest_files(collection, paths, chunk_size=0, chunk_overlap=0, recursive=Tru
|
|||||||
}
|
}
|
||||||
base_name = os.path.splitext(os.path.basename(fpath))[0]
|
base_name = os.path.splitext(os.path.basename(fpath))[0]
|
||||||
|
|
||||||
# ── read content ──────────────────────────────────────────
|
|
||||||
if ext in (".xlsx", ".xlsm"):
|
if ext in (".xlsx", ".xlsm"):
|
||||||
try:
|
try:
|
||||||
import openpyxl
|
import openpyxl
|
||||||
@ -396,6 +438,7 @@ def ingest_files(collection, paths, chunk_size=0, chunk_overlap=0, recursive=Tru
|
|||||||
for row in ws.iter_rows(values_only=True):
|
for row in ws.iter_rows(values_only=True):
|
||||||
vals = [str(c) if c is not None else "" for c in row]
|
vals = [str(c) if c is not None else "" for c in row]
|
||||||
rows.append("\t".join(vals))
|
rows.append("\t".join(vals))
|
||||||
|
|
||||||
lines = rows
|
lines = rows
|
||||||
content = "\n".join(lines)
|
content = "\n".join(lines)
|
||||||
if not content.strip():
|
if not content.strip():
|
||||||
@ -416,22 +459,27 @@ def ingest_files(collection, paths, chunk_size=0, chunk_overlap=0, recursive=Tru
|
|||||||
meta["chunk_index"] = cid
|
meta["chunk_index"] = cid
|
||||||
meta["chunk_lines"] = end - start
|
meta["chunk_lines"] = end - start
|
||||||
meta["chunk_start_line"] = start + 1
|
meta["chunk_start_line"] = start + 1
|
||||||
all_ids.append(doc_id)
|
all_data.append({
|
||||||
all_texts.append(chunk_text)
|
"id": doc_id,
|
||||||
all_metas.append(_sanitize_meta(meta))
|
"text": chunk_text,
|
||||||
|
"metadata": json.dumps(meta, ensure_ascii=False),
|
||||||
|
"vector": get_embedding(chunk_text)
|
||||||
|
})
|
||||||
cid += 1
|
cid += 1
|
||||||
step = chunk_size - chunk_overlap
|
step = chunk_size - chunk_overlap
|
||||||
start += step if step > 0 else 1
|
start += step if step > 0 else 1
|
||||||
processed += 1
|
processed += 1
|
||||||
else:
|
else:
|
||||||
doc_id = f"{base_name}_{sheet_name}"
|
doc_id = f"{base_name}_{sheet_name}"
|
||||||
all_ids.append(doc_id)
|
all_data.append({
|
||||||
all_texts.append(content)
|
"id": doc_id,
|
||||||
all_metas.append(_sanitize_meta(sheet_meta))
|
"text": content,
|
||||||
|
"metadata": json.dumps(sheet_meta, ensure_ascii=False),
|
||||||
|
"vector": get_embedding(content)
|
||||||
|
})
|
||||||
processed += 1
|
processed += 1
|
||||||
wb.close()
|
wb.close()
|
||||||
else:
|
else:
|
||||||
# Plain-text files
|
|
||||||
try:
|
try:
|
||||||
with open(fpath, "r", encoding="utf-8", errors="replace") as f:
|
with open(fpath, "r", encoding="utf-8", errors="replace") as f:
|
||||||
lines = f.readlines()
|
lines = f.readlines()
|
||||||
@ -456,26 +504,32 @@ def ingest_files(collection, paths, chunk_size=0, chunk_overlap=0, recursive=Tru
|
|||||||
meta["chunk_index"] = cid
|
meta["chunk_index"] = cid
|
||||||
meta["chunk_lines"] = end - start
|
meta["chunk_lines"] = end - start
|
||||||
meta["chunk_start_line"] = start + 1
|
meta["chunk_start_line"] = start + 1
|
||||||
all_ids.append(doc_id)
|
all_data.append({
|
||||||
all_texts.append(chunk_text)
|
"id": doc_id,
|
||||||
all_metas.append(_sanitize_meta(meta))
|
"text": chunk_text,
|
||||||
|
"metadata": json.dumps(meta, ensure_ascii=False),
|
||||||
|
"vector": get_embedding(chunk_text)
|
||||||
|
})
|
||||||
cid += 1
|
cid += 1
|
||||||
step = chunk_size - chunk_overlap
|
step = chunk_size - chunk_overlap
|
||||||
start += step if step > 0 else 1
|
start += step if step > 0 else 1
|
||||||
processed += 1
|
processed += 1
|
||||||
else:
|
else:
|
||||||
doc_id = base_name
|
doc_id = base_name
|
||||||
all_ids.append(doc_id)
|
all_data.append({
|
||||||
all_texts.append(content)
|
"id": doc_id,
|
||||||
all_metas.append(_sanitize_meta(base_meta))
|
"text": content,
|
||||||
|
"metadata": json.dumps(base_meta, ensure_ascii=False),
|
||||||
|
"vector": get_embedding(content)
|
||||||
|
})
|
||||||
processed += 1
|
processed += 1
|
||||||
|
|
||||||
if all_ids:
|
if all_data:
|
||||||
col.add(ids=all_ids, documents=all_texts, metadatas=all_metas)
|
table.add(all_data)
|
||||||
|
|
||||||
parts = [f"Ingested {processed} file(s) into '{collection}'"]
|
parts = [f"Ingested {processed} file(s) into '{collection}'"]
|
||||||
if processed > 0:
|
if processed > 0:
|
||||||
parts.append(f"({len(all_ids)} document(s) total)")
|
parts.append(f"({len(all_data)} document(s) total)")
|
||||||
if skipped > 0:
|
if skipped > 0:
|
||||||
parts.append(f"({skipped} file(s) skipped)")
|
parts.append(f"({skipped} file(s) skipped)")
|
||||||
return " ".join(parts)
|
return " ".join(parts)
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user