Files
wevia-brain/wevia-vectorize.py
2026-04-12 23:01:36 +02:00

50 lines
2.1 KiB
Python

#!/usr/bin/env python3
import hashlib
import json
import os
import sys
import urllib.error
import urllib.request
# Ollama endpoint that embed() sends its embedding requests to.
OLLAMA = "http://127.0.0.1:11434/api/embeddings"
# Base URL of the Qdrant REST API; "/collections/..." paths are appended to it.
QDRANT = "http://127.0.0.1:6333"
def embed(text):
    """Return the embedding of *text* from the Ollama endpoint, or None.

    Only the first 2000 characters of *text* are sent, using the
    ``nomic-embed-text`` model. The call is best-effort: any ordinary
    failure (bad input, connection error, timeout, malformed reply) is
    reported on stderr and turned into ``None`` so the caller can skip
    the item.

    Args:
        text: String to embed.

    Returns:
        The value of the ``"embedding"`` field of the JSON reply, or
        ``None`` when the request failed or the field is absent.
    """
    try:
        body = json.dumps(
            {"model": "nomic-embed-text", "prompt": text[:2000]}
        ).encode()
        req = urllib.request.Request(
            OLLAMA, data=body, headers={"Content-Type": "application/json"}
        )
        # The context manager closes the HTTP response even if decoding fails.
        with urllib.request.urlopen(req, timeout=30) as resp:
            return json.loads(resp.read()).get("embedding")
    except Exception as exc:
        # Deliberately broad (best-effort), but unlike a bare ``except`` this
        # lets KeyboardInterrupt and SystemExit propagate.
        print("embed failed: " + str(exc), file=sys.stderr)
        return None
def qdrant_upsert(col, pid, vec, payload):
    """Upsert one point into a Qdrant collection and return the decoded reply.

    Sends a PUT to ``/collections/<col>/points?wait=true`` with a single
    point in the request body.

    Args:
        col: Collection name, inserted verbatim into the URL path.
        pid: Id of the point.
        vec: Vector stored for the point.
        payload: JSON-serialisable payload stored alongside the vector.

    Returns:
        The JSON body of the HTTP response, decoded into Python objects.

    Raises:
        urllib.error.URLError: If the request cannot be completed
            (``HTTPError`` for an error status); not caught here.
    """
    body = json.dumps(
        {"points": [{"id": pid, "vector": vec, "payload": payload}]}
    ).encode()
    req = urllib.request.Request(
        QDRANT + "/collections/" + col + "/points?wait=true",
        data=body,
        headers={"Content-Type": "application/json"},
        method="PUT",
    )
    # Close the response deterministically instead of leaving it to the GC.
    with urllib.request.urlopen(req, timeout=10) as resp:
        return json.loads(resp.read())
def ensure_col(name, size):
    """Create the Qdrant collection *name* unless it already exists.

    The collection is probed with a GET request. Only an HTTP 404 reply is
    taken to mean "missing"; in that case the collection is created with
    cosine distance and vectors of dimension *size*, and a confirmation
    line is printed. Any other failure (server unreachable, timeout, other
    HTTP status) propagates instead of triggering a create attempt.

    Args:
        name: Collection name, inserted verbatim into the URL path.
        size: Dimension of the vectors the collection will hold.

    Raises:
        urllib.error.URLError: If the probe fails for a reason other than
            HTTP 404, or if the create request fails.
    """
    url = QDRANT + "/collections/" + name
    try:
        with urllib.request.urlopen(url, timeout=5):
            return  # Probe succeeded: the collection is already there.
    except urllib.error.HTTPError as exc:
        if exc.code != 404:
            raise
    body = json.dumps({"vectors": {"size": size, "distance": "Cosine"}}).encode()
    req = urllib.request.Request(
        url,
        data=body,
        headers={"Content-Type": "application/json"},
        method="PUT",
    )
    with urllib.request.urlopen(req, timeout=10):
        pass  # Reply body is not needed; just make sure the response is closed.
    print("Created: " + name)
def run(kb_dir="/opt/wevia-brain/knowledge"):
    """Embed the knowledge files under *kb_dir* into the ``wevia_kb`` collection.

    Walks *kb_dir* recursively, picks up ``.md``, ``.json`` and ``.txt``
    files, splits each file of at least 50 characters into overlapping
    chunks, embeds every chunk with ``embed`` and stores it with
    ``qdrant_upsert``. A summary line is printed at the end.

    Files that cannot be read or decoded as UTF-8 are skipped with a message
    on stderr instead of aborting the whole run. Chunks whose embedding fails
    are skipped as well.

    Args:
        kb_dir: Root directory of the knowledge base.

    Returns:
        None. Returns early, after printing a message, when the initial
        probe embedding fails.
    """
    files = []
    for root, _dirs, fnames in os.walk(kb_dir):
        for fname in fnames:
            if fname.endswith((".md", ".json", ".txt")):
                files.append(os.path.join(root, fname))

    # The probe both checks that embedding works and yields the vector size
    # needed to create the collection.
    probe = embed("test")
    if not probe:
        print("Ollama embed failed")
        return
    ensure_col("wevia_kb", len(probe))

    ok = 0
    for fp in files:
        try:
            with open(fp, encoding="utf-8") as fh:
                content = fh.read()
        except (OSError, UnicodeDecodeError) as exc:
            print("Skipped " + fp + ": " + str(exc), file=sys.stderr)
            continue
        if len(content) < 50:
            continue
        # 1500-character windows taken every 1200 characters, so consecutive
        # chunks overlap by 300 characters.
        for ci, start in enumerate(range(0, len(content), 1200)):
            chunk = content[start:start + 1500]
            vec = embed(chunk)
            if not vec:
                continue
            # Id is derived from path + chunk index, so re-running overwrites
            # the same points. Only 32 bits of the digest are kept, so two
            # different chunks can in principle collide.
            pid = int(hashlib.md5((fp + str(ci)).encode()).hexdigest()[:8], 16)
            qdrant_upsert(
                "wevia_kb",
                pid,
                vec,
                {"file": os.path.basename(fp), "chunk": ci, "text": chunk[:500]},
            )
            ok += 1
    print("Done: " + str(ok) + " chunks from " + str(len(files)) + " files")
# Script entry point: vectorize the default knowledge directory.
if __name__ == "__main__":
    run()