# html/api/seed-empty-collections.py

#!/usr/bin/env python3
"""
WEVAL — Seed 9 empty Qdrant collections with real content
V96.10 Opus 19avr · Doctrine #4 honnêteté (real seed, pas fake)

Collections to seed:
  weval_intents_memory (384d)   — sample of 1579 wired intents
  weval_agents_registry (384d)  — sample of 950 agents
  kb_lean6sigma (768d)          — Lean 6σ principles
  kb_dmaic_playbooks (768d)     — DMAIC playbooks
  kb_bpmn_flows (768d)          — BPMN flows
  kb_bpmn_patterns (768d)       — BPMN patterns
  kb_consulting_strategy (768d) — WEVAL consulting strategies
  kb_vsm_best_practices (768d)  — Value Stream Mapping best practices
  kb_wevads_deliv (768d)        — WEVADS delivery knowledge
"""
import os, json, sys, glob, subprocess
from pathlib import Path
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct
from sentence_transformers import SentenceTransformer
import uuid

# Qdrant endpoint. Defaults to the local instance; set QDRANT_URL to point the
# same script at another server without editing the file. An empty value falls
# back to the default as well.
QDRANT = os.environ.get("QDRANT_URL") or "http://localhost:6333"
client = QdrantClient(QDRANT)

# Two encoders are loaded because the target collections use two vector sizes:
# m384 feeds the 384d weval_* collections, m768 feeds the 768d kb_* collections.
print("Loading models...")
m384 = SentenceTransformer('all-MiniLM-L6-v2')
m768 = SentenceTransformer('all-mpnet-base-v2')
print("Models loaded\n")

# ═══════════════════════════════════════════════════════════════════
# 1. weval_intents_memory (384d) — from /wired-pending/ filenames
# ═══════════════════════════════════════════════════════════════════
print("=== 1. weval_intents_memory ===")
# Sort before sampling: glob order depends on the filesystem, and a stable
# sample is needed for the deterministic point ids below to make re-runs
# overwrite the same points instead of adding new ones.
intent_files = sorted(glob.glob('/var/www/html/api/wired-pending/intent-*.php'))[:50]
intents_docs = []
for f in intent_files:
    base = os.path.basename(f)
    # The glob guarantees the 'intent-' prefix and the '.php' suffix, so slice
    # them off; str.replace() would also strip occurrences in the middle.
    name = base[len('intent-'):-len('.php')]
    # Only the first 200 characters end up in the embedded document, so read
    # just those instead of loading the whole file.
    try:
        with open(f, 'r', encoding='utf-8', errors='ignore') as file:
            preview = file.read(200)
    except OSError:
        # Best effort: an unreadable file is still indexed by its name.
        preview = ''
    doc = f"Intent: {name} · File: {base} · Preview: {preview}"
    intents_docs.append({"name": name, "doc": doc, "file": base})

print(f"  docs: {len(intents_docs)}")
if intents_docs:
    embeddings = m384.encode([d['doc'] for d in intents_docs], show_progress_bar=False)
    points = [
        PointStruct(
            # Deterministic id (collection + file name): seeding twice updates
            # the existing point rather than inserting a duplicate.
            id=str(uuid.uuid5(uuid.NAMESPACE_URL, f"weval_intents_memory/{d['file']}")),
            vector=emb.tolist(),
            payload={
                "name": d['name'], "type": "intent",
                "file": d['file'], "source": "wired-pending-glob"
            },
        )
        for d, emb in zip(intents_docs, embeddings)
    ]
    client.upsert(collection_name="weval_intents_memory", points=points)
    print(f"  upserted: {len(points)}")
else:
    # Nothing matched the glob: skip the upsert instead of sending an empty
    # batch (mirrors the guard used for weval_agents_registry).
    print("  skipped: no intent files found")

# ═══════════════════════════════════════════════════════════════════
# 2. weval_agents_registry (384d) — from /agent-stubs/ or registry
# ═══════════════════════════════════════════════════════════════════
print("\n=== 2. weval_agents_registry ===")
# Sorted so the 50-file sample is stable across runs (glob order is
# filesystem-dependent), which the deterministic point ids below rely on.
agent_files = sorted(glob.glob('/var/www/html/api/agent-stubs/*.php'))[:50]
agents_source = "agent-stubs-glob"
if not agent_files:
    # Fallback: agent-like intents from wired-pending. Record the real origin
    # so the payload does not claim the files came from agent-stubs.
    agent_files = sorted(glob.glob('/var/www/html/api/wired-pending/intent-opus4-*agent*'))[:50]
    agents_source = "wired-pending-glob"
agents_docs = []
for f in agent_files:
    base = os.path.basename(f)
    name = base.replace('.php', '')
    # Only the first 200 characters are embedded, so read just those.
    try:
        with open(f, 'r', encoding='utf-8', errors='ignore') as file:
            preview = file.read(200)
    except OSError:
        # Best effort: an unreadable file is still indexed by its name.
        preview = ''
    doc = f"Agent: {name} · Preview: {preview}"
    agents_docs.append({"name": name, "doc": doc, "file": base})

print(f"  docs: {len(agents_docs)}")
if agents_docs:
    embeddings = m384.encode([d['doc'] for d in agents_docs], show_progress_bar=False)
    points = [
        PointStruct(
            # Deterministic id (collection + file name): re-running the seed
            # overwrites the same point instead of inserting a duplicate.
            id=str(uuid.uuid5(uuid.NAMESPACE_URL, f"weval_agents_registry/{d['file']}")),
            vector=emb.tolist(),
            payload={
                "name": d['name'], "type": "agent",
                "file": d['file'], "source": agents_source
            },
        )
        for d, emb in zip(agents_docs, embeddings)
    ]
    client.upsert(collection_name="weval_agents_registry", points=points)
    print(f"  upserted: {len(points)}")

# ═══════════════════════════════════════════════════════════════════
# 3-9. KB collections (768d) — seed with domain-specific content
# ═══════════════════════════════════════════════════════════════════
# Seed corpus for the knowledge-base collections.
# Maps a Qdrant collection name to the list of text snippets stored in it.
# The seeding loop further down encodes each snippet with m768 and stores the
# snippet verbatim in the point payload under "content", so every string here
# is runtime data: editing one changes both the vector and the stored text.
KB_CONTENT = {
    "kb_lean6sigma": [
        "Voice of Customer (VOC) — translate customer needs into CTQ (Critical To Quality) specifications. Start every Lean 6σ project with VOC.",
        "DMAIC methodology — Define → Measure → Analyze → Improve → Control. 5-phase structured problem-solving.",
        "Sigma level vs DPMO — 6σ = 3.4 defects per million opportunities. 5σ = 233. 4σ = 6210. Higher sigma = better quality.",
        "Value vs Non-Value Added — classify all process steps. Target: maximize VA, eliminate Non-VA, reduce Necessary Non-VA.",
        "Pareto Principle (80/20) — 80% of defects come from 20% of causes. Focus improvement on vital few.",
        "FMEA (Failure Mode Effects Analysis) — systematic approach. Severity × Occurrence × Detection = RPN. Priority: RPN > 100.",
        "5S methodology — Sort, Set in order, Shine, Standardize, Sustain. Foundation of Lean workplace.",
        "Control charts — X-bar R, X-bar S, p-chart, np-chart, c-chart, u-chart. SPC monitoring tools.",
        "Gemba walk — go see where work happens. Lean leadership practice for continuous improvement.",
        "Kaizen events — 3-5 day focused improvement workshops. Cross-functional team, rapid deployment.",
    ],
    "kb_dmaic_playbooks": [
        "DEFINE phase — SIPOC diagram (Supplier Input Process Output Customer), project charter, problem statement, team formation, stakeholder analysis.",
        "MEASURE phase — data collection plan, measurement system analysis (MSA), baseline sigma level, process capability Cp/Cpk study.",
        "ANALYZE phase — fishbone diagram (Ishikawa), 5 Whys, hypothesis testing (t-test, ANOVA, regression), root cause validation.",
        "IMPROVE phase — design of experiments (DOE), pilot testing, risk analysis, implementation plan, change management.",
        "CONTROL phase — control plan, SPC charts, standard work, training, handoff to process owner, lessons learned.",
        "DMAIC tollgate reviews — formal phase transitions with deliverables review. Sponsor approval required.",
        "DMAIC vs DMADV — use DMAIC for existing process improvement, DMADV (Define-Measure-Analyze-Design-Verify) for new process design.",
    ],
    "kb_bpmn_flows": [
        "BPMN 2.0 standard — Business Process Model and Notation. ISO 19510. Universal diagramming language for business processes.",
        "Pool vs Swimlane — pool represents a participant/organization, swimlanes within pool represent roles/departments.",
        "Tasks — atomic work units. Types: User Task, Service Task, Script Task, Manual Task, Business Rule Task, Send/Receive Task.",
        "Gateways — decision points. Types: Exclusive (XOR), Parallel (AND), Inclusive (OR), Event-Based, Complex.",
        "Events — triggers and outcomes. Start, Intermediate, End. Types: Message, Timer, Error, Signal, Compensation, Terminate.",
        "Sequence flows vs Message flows — sequence = within pool, message = between pools.",
        "Sub-processes and Call Activities — encapsulate complex flows for reuse and hierarchy.",
    ],
    "kb_bpmn_patterns": [
        "Sequential pattern — tasks in strict order. Foundation building block.",
        "Parallel split (AND-split) — fork into concurrent flows. Merge with AND-join.",
        "Exclusive choice (XOR) — one path selected based on condition. Mutually exclusive branches.",
        "Inclusive choice (OR) — one or more paths based on conditions.",
        "Deferred choice — path selected by first event occurring (race).",
        "Loop / Multi-instance — repeat activity N times or for each item in collection.",
        "Compensation — undo completed work on failure. Transactional rollback pattern.",
    ],
    "kb_consulting_strategy": [
        "WEVAL consulting core methodology — sovereign AI platform + Lean 6σ + ERP gap-fill. 25 ERPs × 60 pain points × 950 agents = 17.36M€ savings/client.",
        "WEVAL pricing — Discovery 5k€ + POC 15-25k€ + Rollout 80-300k€ + Managed 30-80k€/an. Modular engagement.",
        "Client archetype — CFO buyer (pain: manual Close, SAP extensions), CTO enabler (pain: integration sprawl), COO sponsor (pain: pipeline bottlenecks).",
        "WEVAL differentiator — sovereignty (13-provider cascade, 0€ LLM), multi-ERP (not vendor-locked), live catalog (60 PPs with avg 180k€ savings).",
        "Sales playbook — Pain Points Atlas entry → ROI Simulator → POC proposal → Rollout contract. Dogfood proof: WEVAL closes 35 gaps on itself = 2.4M€ savings.",
        "Ethica partnership — Kaouther Najar group. 156714 HCPs (DZ 112k + MA 19k + TN 17k + INTL). Campaign 109920 draft @10k/day. 0.8DH/contact counter [1.5/1.2/1.0DH].",
    ],
    "kb_vsm_best_practices": [
        "Value Stream Mapping — visualize entire material + information flow from supplier to customer. Start with current state map.",
        "Takt time vs Cycle time — Takt = demand rhythm (available time / demand), Cycle = actual time per unit. Balance: cycle ≤ takt.",
        "Flow efficiency = Value-Added Time / Total Lead Time. Typical: <10%. World-class: >25%.",
        "Pull vs Push — Kanban pull system replenishes based on consumption. Eliminates overproduction (#1 waste).",
        "7 wastes (TIMWOOD) — Transport, Inventory, Motion, Waiting, Overproduction, Over-processing, Defects. 8th added: Skills underutilization.",
        "Future state map — redesigned VSM with improvements. Aim for continuous flow + pull + leveled schedule.",
        "Spaghetti diagram — trace physical movement. Reveals excessive travel (Motion waste).",
    ],
    "kb_wevads_deliv": [
        "WEVADS architecture — PostgreSQL adx_system+adx_clients (6.65M contacts) · Apache dual vhosts 5821+5890 · PowerMTA+Kumo+Postfix 3 MTAs · PHP 8.5-FPM · N8N workflows · OVH tracking 151.80.235.110.",
        "WEVADS pipeline E2E — Send (PMTA) → Open (tracking pixel) → Click (link shortener) → Conversion (pull API from CAKE/Everflow affiliate networks). Conversion PULL not postback.",
        "WEVADS O365 — 604 accounts across 9+ tenants. Graph API creation. Users per tenant 500 cap. accoff04/06 primary. 97pct inbox rate via PMTA→O365 relay.",
        "WEVADS Ethica delivery — dns wevup.app · SPF+DKIM 2048bit+DMARC · Cloudflare zone 53e067fbc5c532a1 · PTR mail.weval-consulting.com · consent.wevup.app live 17 real optins.",
        "WEVADS seed network — 1275 seed accounts across 8 ISPs. Warmup 1783 accounts. Cap 77170 emails/day. Quality Guard enforces good_creatives threshold.",
        "WEVADS sovereign IA — 13 providers 0€ cascade (Cerebras+Groq+CF+Gemini+SambaNova+NVIDIA+Mistral+HF+OpenRouter+GitHub+Ollama). Fallback auto on 429/402/401.",
    ],
}

# Embed every KB snippet with the 768d model and upsert it into the collection
# named by its KB_CONTENT key.
for col_name, docs in KB_CONTENT.items():
    print(f"\n=== 3+. {col_name} ===")
    print(f"  docs: {len(docs)}")
    if not docs:
        # Nothing to embed; skip instead of sending an empty upsert batch.
        print("  skipped: no content")
        continue
    kb_type = col_name.replace('kb_', '')
    embeddings = m768.encode(docs, show_progress_bar=False)
    points = [
        PointStruct(
            # Deterministic id derived from collection + snippet text, so a
            # second run overwrites the same points instead of duplicating
            # every snippet under fresh random ids.
            id=str(uuid.uuid5(uuid.NAMESPACE_URL, f"{col_name}/{text}")),
            vector=emb.tolist(),
            payload={
                "content": text, "type": kb_type,
                "source": "V96.10-seed", "seeded_at": "2026-04-19"
            },
        )
        for text, emb in zip(docs, embeddings)
    ]
    client.upsert(collection_name=col_name, points=points)
    print(f"  upserted: {len(points)}")

# Closing banner followed by the live point count of every seeded collection,
# read back from Qdrant.
banner = "=" * 60
print("\n" + banner)
print("🏆 SEED COMPLETE")
print(banner)
# Summary: the two registry collections first, then each KB collection.
seeded_collections = ['weval_intents_memory', 'weval_agents_registry', *KB_CONTENT]
for seeded in seeded_collections:
    count = client.get_collection(seeded).points_count
    print(f"  {seeded:30} points={count}")