50 lines
2.1 KiB
Python
50 lines
2.1 KiB
Python
#!/usr/bin/env python3
|
|
import hashlib
import json
import os
import sys
import urllib.error
import urllib.request
|
|
|
|
# URL that embed() POSTs its JSON embedding request to (local service on port 11434).
OLLAMA = "http://127.0.0.1:11434/api/embeddings"
|
|
# Base URL that the "/collections/..." request paths are appended to (local service on port 6333).
QDRANT = "http://127.0.0.1:6333"
|
|
|
|
def embed(text):
    """Return the embedding vector for *text*, or None when it cannot be obtained.

    The text is truncated to its first 2000 characters and sent to the
    OLLAMA endpoint with the "nomic-embed-text" model.

    Args:
        text: String to embed.

    Returns:
        The value of the "embedding" key of the JSON response, or None if
        the key is absent or the request/decoding failed.
    """
    try:
        body = json.dumps({"model": "nomic-embed-text", "prompt": text[:2000]}).encode()
        req = urllib.request.Request(
            OLLAMA, data=body, headers={"Content-Type": "application/json"}
        )
        # The context manager closes the HTTP response even if decoding fails.
        with urllib.request.urlopen(req, timeout=30) as resp:
            return json.loads(resp.read()).get("embedding")
    except Exception as exc:
        # Stay best-effort (callers treat None as "skip"), but let
        # KeyboardInterrupt/SystemExit propagate and report the reason.
        print("embed failed: " + repr(exc), file=sys.stderr)
        return None
|
|
|
|
def qdrant_upsert(col, pid, vec, payload):
    """Upsert a single point into collection *col* and return the decoded reply.

    Sends a PUT to ``<QDRANT>/collections/<col>/points?wait=true`` with one
    point built from the arguments.

    Args:
        col: Collection name; inserted into the URL path as-is (not URL-quoted).
        pid: Point id.
        vec: Vector stored under the point's "vector" key.
        payload: JSON-serialisable object stored under the point's "payload" key.

    Returns:
        The JSON-decoded response body.

    Raises:
        urllib.error.URLError: If the request fails or the server answers
            with an HTTP error status.
        ValueError: If the response body is not valid JSON.
    """
    body = json.dumps(
        {"points": [{"id": pid, "vector": vec, "payload": payload}]}
    ).encode()
    req = urllib.request.Request(
        QDRANT + "/collections/" + col + "/points?wait=true",
        data=body,
        headers={"Content-Type": "application/json"},
        method="PUT",
    )
    # Close the response deterministically instead of leaving it to the GC.
    with urllib.request.urlopen(req, timeout=10) as resp:
        return json.loads(resp.read())
|
|
|
|
def ensure_col(name, size):
    """Create collection *name* if the server reports that it does not exist.

    A GET on ``<QDRANT>/collections/<name>`` is used as the existence probe.
    Only an HTTP 404 answer triggers creation; any other failure (connection
    error, timeout, other HTTP status) is raised to the caller instead of
    being mistaken for a missing collection.

    Args:
        name: Collection name; inserted into the URL path as-is.
        size: Vector size written into the creation request.

    Raises:
        urllib.error.URLError: If the probe fails with anything other than
            HTTP 404, or if the creation request fails.
    """
    url = QDRANT + "/collections/" + name
    try:
        with urllib.request.urlopen(url, timeout=5):
            return  # Probe succeeded: nothing to create.
    except urllib.error.HTTPError as err:
        if err.code != 404:
            raise

    body = json.dumps({"vectors": {"size": size, "distance": "Cosine"}}).encode()
    req = urllib.request.Request(
        url,
        data=body,
        headers={"Content-Type": "application/json"},
        method="PUT",
    )
    # Only the status matters here; the context manager closes the response.
    with urllib.request.urlopen(req, timeout=10):
        pass
    print("Created: " + name)
|
|
|
|
def _kb_files(kb_dir):
    """Return the paths of every .md/.json/.txt file found under *kb_dir*."""
    found = []
    for root, _dirs, fnames in os.walk(kb_dir):
        for fname in fnames:
            if fname.endswith((".md", ".json", ".txt")):
                found.append(os.path.join(root, fname))
    return found


def _read_text(path):
    """Return the text of *path*, or None (with a stderr note) if unreadable."""
    try:
        # Explicit encoding avoids locale-dependent decoding; undecodable
        # bytes are replaced so one odd file cannot abort the whole run.
        with open(path, encoding="utf-8", errors="replace") as fh:
            return fh.read()
    except OSError as exc:
        print("Skipped " + path + ": " + str(exc), file=sys.stderr)
        return None


def _split_chunks(text, size=1500, step=1200):
    """Cut *text* into windows of *size* characters starting every *step*."""
    return [text[i:i + size] for i in range(0, len(text), step)]


def run(kb_dir="/opt/wevia-brain/knowledge"):
    """Embed every knowledge file under *kb_dir* and upsert it into "wevia_kb".

    Files ending in .md, .json or .txt are read, files shorter than 50
    characters are skipped, and the rest are cut into 1500-character windows
    that start every 1200 characters. Each window is embedded with embed()
    and stored with qdrant_upsert(); windows whose embedding comes back
    empty are skipped. A summary line is printed at the end.

    Args:
        kb_dir: Root directory that is walked recursively.

    Returns:
        None. Returns early, after printing a message, when the probe
        embedding of the string "test" fails.
    """
    files = _kb_files(kb_dir)

    # A probe embedding both checks the service and yields the vector size.
    probe = embed("test")
    if not probe:
        print("Ollama embed failed")
        return
    ensure_col("wevia_kb", len(probe))

    ok = 0
    for fp in files:
        content = _read_text(fp)
        if content is None or len(content) < 50:
            continue
        for ci, chunk in enumerate(_split_chunks(content)):
            vec = embed(chunk)
            if not vec:
                continue
            # NOTE(review): only the first 32 bits of the digest are used, so
            # distinct chunks can collide in a large corpus and overwrite
            # each other. Left unchanged because widening it would change
            # the ids of points that may already be stored.
            pid = int(hashlib.md5((fp + str(ci)).encode()).hexdigest()[:8], 16)
            qdrant_upsert(
                "wevia_kb",
                pid,
                vec,
                {"file": os.path.basename(fp), "chunk": ci, "text": chunk[:500]},
            )
            ok += 1
    print("Done: " + str(ok) + " chunks from " + str(len(files)) + " files")
|
|
|
|
# Script entry point: index the default knowledge directory.
if __name__ == "__main__":
    run()
|