73 lines
2.0 KiB
Python
73 lines
2.0 KiB
Python
#!/usr/bin/env python3
|
|
"""Embed obsidian vault markdown into Qdrant for semantic search"""
|
|
import os, json, hashlib, requests
|
|
|
|
# Root directory of the Obsidian vault that is scanned for *.md notes.
VAULT = "/opt/obsidian-vault"
# Base URL of the Qdrant REST API.
QDRANT = "http://127.0.0.1:6333"
# Base URL of the Ollama server used to compute embeddings.
OLLAMA = "http://127.0.0.1:11435"
# Name of the Qdrant collection that receives the note vectors.
COLLECTION = "obsidian_vault"

# Create the collection if it does not exist yet.
# Any non-200 answer to the lookup is treated as "missing" and triggers a
# create attempt; the create response is then verified so that a rejected
# request is reported instead of being announced as a success.
try:
    r = requests.get(f"{QDRANT}/collections/{COLLECTION}", timeout=5)
    if r.status_code != 200:
        created = requests.put(
            f"{QDRANT}/collections/{COLLECTION}",
            json={
                # 384 is the vector length the scan step below requires.
                "vectors": {"size": 384, "distance": "Cosine"}
            },
            timeout=10,
        )
        # Raises requests.HTTPError (a RequestException) on 4xx/5xx.
        created.raise_for_status()
        print(f"Created collection {COLLECTION}")
except requests.RequestException as e:
    print(f"Qdrant error: {e}")
    # SystemExit instead of the site-provided exit() helper, which is not
    # guaranteed to be defined (e.g. under `python -S`).
    raise SystemExit(1)
|
|
|
|
def embed(text):
    """Return the Ollama embedding vector for *text*, or None on failure.

    Only the first 2000 characters of *text* are sent to the
    ``all-minilm`` model through ``POST {OLLAMA}/api/embed``.

    Args:
        text: The string to embed.

    Returns:
        The first entry of the response's ``"embeddings"`` list, or
        ``None`` when the request fails, the server answers with an HTTP
        error, the body is not valid JSON, or no embedding is present.
    """
    try:
        resp = requests.post(
            f"{OLLAMA}/api/embed",
            json={"model": "all-minilm", "input": text[:2000]},
            timeout=10,
        )
        resp.raise_for_status()
        data = resp.json()
    except (requests.RequestException, ValueError):
        # Transport/HTTP errors and undecodable bodies are expected failure
        # modes; KeyboardInterrupt and SystemExit are deliberately NOT
        # swallowed here (the previous bare `except:` hid them).
        return None

    embeddings = data.get("embeddings") if isinstance(data, dict) else None
    if not isinstance(embeddings, list) or not embeddings:
        # Missing or empty result: report "no vector" uniformly as None.
        return None
    return embeddings[0]
|
|
|
|
# Scan the vault: embed every Markdown note and collect one Qdrant point
# per successfully embedded file.
# NOTE: point ids are positional (1..N in traversal order), so they are only
# reproducible for an unchanged vault; the traversal is sorted to make that
# order deterministic instead of depending on directory listing order.
points = []
idx = 0
for root, dirs, files in os.walk(VAULT):
    dirs.sort()  # in-place sort steers os.walk's descent order
    for f in sorted(files):
        if not f.endswith(".md"):
            continue
        path = os.path.join(root, f)
        # relpath strips only the leading vault prefix; str.replace would
        # also remove any later occurrence of the prefix inside the path.
        rel = os.path.relpath(path, VAULT)

        try:
            with open(path, encoding="utf-8") as fh:
                content = fh.read()
        except (OSError, UnicodeDecodeError) as e:
            # One unreadable note must not abort the whole indexing run.
            print(f"Skipping {rel}: {e}")
            continue

        # Skip frontmatter for embedding: drop a leading '---' ... '---'
        # section when both delimiters are present.
        if content.startswith("---"):
            parts = content.split("---", 2)
            if len(parts) >= 3:
                content = parts[2].strip()

        vec = embed(f"{rel}: {content[:500]}")
        # Ignore failed embeddings and vectors of an unexpected length.
        if not vec or len(vec) != 384:
            continue

        idx += 1
        points.append({
            "id": idx,
            "vector": vec,
            "payload": {
                "file": rel,
                "content": content[:500],
                "size": len(content),
                "source": "obsidian_vault",
            },
        })
|
|
|
|
# Upsert the collected points to Qdrant.
# Points are sent in fixed-size batches so a large vault does not have to fit
# into a single request within the 30 s timeout, and every response is
# checked so a rejected upsert is not reported as a success.
UPSERT_BATCH_SIZE = 100

if points:
    stored = 0
    for start in range(0, len(points), UPSERT_BATCH_SIZE):
        batch = points[start:start + UPSERT_BATCH_SIZE]
        try:
            r = requests.put(
                f"{QDRANT}/collections/{COLLECTION}/points",
                json={"points": batch},
                timeout=30,
            )
        except requests.RequestException as e:
            print(f"Qdrant upsert error after {stored} notes: {e}")
            raise SystemExit(1)
        if not r.ok:
            print(
                f"Qdrant upsert failed after {stored} notes "
                f"({r.status_code}): {r.text[:200]}"
            )
            raise SystemExit(1)
        stored += len(batch)
    print(f"Embedded {stored} notes into Qdrant ({r.status_code})")
else:
    print("No points to embed")
|