Files
html/api/seed-empty-collections.py
opus d2dbe73961
Some checks failed
WEVAL NonReg / nonreg (push) Has been cancelled
auto-commit via WEVIA vault_git intent 2026-04-19T20:42:48+00:00
2026-04-19 22:42:48 +02:00

179 lines
12 KiB
Python
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
WEVAL — Seed 9 empty Qdrant collections with real content
V96.10 Opus, 19 Apr · Doctrine #4: honesty (real seed, not fake)
Collections to seed:
weval_intents_memory (384d) — sample of 1579 wired intents
weval_agents_registry (384d) — sample of 950 agents
kb_lean6sigma (768d) — Lean 6σ principles
kb_dmaic_playbooks (768d) — DMAIC playbooks
kb_bpmn_flows (768d) — BPMN flows
kb_bpmn_patterns (768d) — BPMN patterns
kb_consulting_strategy (768d) — WEVAL consulting strategies
kb_vsm_best_practices (768d) — Value Stream Mapping best practices
kb_wevads_deliv (768d) — WEVADS delivery knowledge
"""
import os, json, sys, glob, subprocess
from pathlib import Path
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct
from sentence_transformers import SentenceTransformer
import uuid
# Qdrant endpoint. Defaults to the local instance; set the QDRANT_URL
# environment variable to seed another server without editing this script.
# `or` (rather than a .get() default) also covers an empty QDRANT_URL.
QDRANT = os.environ.get("QDRANT_URL") or "http://localhost:6333"
client = QdrantClient(QDRANT)

# Two embedding models are loaded because the target collections use two
# different vector sizes (see the module docstring): m384 feeds the 384d
# weval_* collections, m768 feeds the 768d kb_* collections.
print("Loading models...")
m384 = SentenceTransformer('all-MiniLM-L6-v2')
m768 = SentenceTransformer('all-mpnet-base-v2')
print("Models loaded\n")
# ═══════════════════════════════════════════════════════════════════
# 1. weval_intents_memory (384d) — from /wired-pending/ filenames
# ═══════════════════════════════════════════════════════════════════
def intent_name_from_path(path):
    """Return the intent name encoded in an ``intent-<name>.php`` file path.

    Only the leading ``intent-`` prefix and the trailing ``.php`` suffix of
    the base name are removed, so a name that itself contains either
    substring is left intact (a blanket ``str.replace`` would mangle it).
    """
    base = os.path.basename(path)
    if base.startswith('intent-'):
        base = base[len('intent-'):]
    if base.endswith('.php'):
        base = base[:-len('.php')]
    return base


print("=== 1. weval_intents_memory ===")
# Sort before sampling: glob order depends on the file system, so an
# unsorted [:50] slice could pick a different sample on every run.
intent_files = sorted(glob.glob('/var/www/html/api/wired-pending/intent-*.php'))[:50]
intents_docs = []
for f in intent_files:
    file_name = os.path.basename(f)
    name = intent_name_from_path(f)
    # Read only the first 500 chars for context instead of the whole file.
    try:
        with open(f, 'r', errors='ignore') as file:
            preview = file.read(500)
    except OSError:
        # Best effort: an unreadable file is still indexed by name, with an
        # empty preview. Only I/O errors are swallowed (not Ctrl-C etc.).
        preview = ''
    doc = f"Intent: {name} · File: {file_name} · Preview: {preview[:200]}"
    intents_docs.append({"name": name, "doc": doc, "file": file_name})
print(f" docs: {len(intents_docs)}")
if intents_docs:
    embeddings = m384.encode([d['doc'] for d in intents_docs], show_progress_bar=False)
    points = [
        PointStruct(
            # Deterministic ID derived from collection + file name, so that
            # re-running the seed overwrites the same points instead of
            # piling up duplicates under fresh random UUIDs.
            id=str(uuid.uuid5(uuid.NAMESPACE_URL, f"weval_intents_memory/{d['file']}")),
            vector=emb.tolist(),
            payload={
                "name": d['name'], "type": "intent",
                "file": d['file'], "source": "wired-pending-glob"
            },
        )
        for d, emb in zip(intents_docs, embeddings)
    ]
    client.upsert(collection_name="weval_intents_memory", points=points)
    print(f" upserted: {len(points)}")
else:
    # Nothing matched the glob: skip the encode/upsert round trip entirely
    # (mirrors the guard used for weval_agents_registry).
    print(" upserted: 0")
# ═══════════════════════════════════════════════════════════════════
# 2. weval_agents_registry (384d) — from /agent-stubs/ or registry
# ═══════════════════════════════════════════════════════════════════
print("\n=== 2. weval_agents_registry ===")
# Sort before sampling: glob order depends on the file system, so an
# unsorted [:50] slice could pick a different sample on every run.
agent_files = sorted(glob.glob('/var/www/html/api/agent-stubs/*.php'))[:50]
# Provenance label stored in each point's payload; it must follow whichever
# glob actually produced the files.
agents_source = "agent-stubs-glob"
if not agent_files:
    # Fallback when no agent stubs exist: agent-related wired-pending intents.
    agent_files = sorted(glob.glob('/var/www/html/api/wired-pending/intent-opus4-*agent*'))[:50]
    agents_source = "wired-pending-glob"
agents_docs = []
for f in agent_files:
    file_name = os.path.basename(f)
    name = file_name.replace('.php', '')
    # Read only the first 500 chars for context instead of the whole file.
    try:
        with open(f, 'r', errors='ignore') as file:
            preview = file.read(500)
    except OSError:
        # Best effort: an unreadable file is still indexed by name, with an
        # empty preview. Only I/O errors are swallowed (not Ctrl-C etc.).
        preview = ''
    doc = f"Agent: {name} · Preview: {preview[:200]}"
    agents_docs.append({"name": name, "doc": doc, "file": file_name})
print(f" docs: {len(agents_docs)}")
if agents_docs:
    embeddings = m384.encode([d['doc'] for d in agents_docs], show_progress_bar=False)
    points = [
        PointStruct(
            # Deterministic ID derived from collection + file name, so that
            # re-running the seed overwrites the same points instead of
            # piling up duplicates under fresh random UUIDs.
            id=str(uuid.uuid5(uuid.NAMESPACE_URL, f"weval_agents_registry/{d['file']}")),
            vector=emb.tolist(),
            payload={
                "name": d['name'], "type": "agent",
                "file": d['file'], "source": agents_source
            },
        )
        for d, emb in zip(agents_docs, embeddings)
    ]
    client.upsert(collection_name="weval_agents_registry", points=points)
    print(f" upserted: {len(points)}")
# ═══════════════════════════════════════════════════════════════════
# 3-9. KB collections (768d) — seed with domain-specific content
# ═══════════════════════════════════════════════════════════════════
# Maps each target Qdrant collection name to its list of seed documents.
# Every string is one standalone document: the seeding loop that follows
# embeds each one with m768 and stores the text verbatim in the point
# payload under "content". The strings are runtime data — edit with care.
KB_CONTENT = {
    # Lean Six Sigma principles.
    "kb_lean6sigma": [
        "Voice of Customer (VOC) — translate customer needs into CTQ (Critical To Quality) specifications. Start every Lean 6σ project with VOC.",
        "DMAIC methodology — Define → Measure → Analyze → Improve → Control. 5-phase structured problem-solving.",
        "Sigma level vs DPMO — 6σ = 3.4 defects per million opportunities. 5σ = 233. 4σ = 6210. Higher sigma = better quality.",
        "Value vs Non-Value Added — classify all process steps. Target: maximize VA, eliminate Non-VA, reduce Necessary Non-VA.",
        "Pareto Principle (80/20) — 80% of defects come from 20% of causes. Focus improvement on vital few.",
        "FMEA (Failure Mode Effects Analysis) — systematic approach. Severity × Occurrence × Detection = RPN. Priority: RPN > 100.",
        "5S methodology — Sort, Set in order, Shine, Standardize, Sustain. Foundation of Lean workplace.",
        "Control charts — X-bar R, X-bar S, p-chart, np-chart, c-chart, u-chart. SPC monitoring tools.",
        "Gemba walk — go see where work happens. Lean leadership practice for continuous improvement.",
        "Kaizen events — 3-5 day focused improvement workshops. Cross-functional team, rapid deployment.",
    ],
    # DMAIC playbooks: one entry per phase plus tollgates and DMAIC vs DMADV.
    "kb_dmaic_playbooks": [
        "DEFINE phase — SIPOC diagram (Supplier Input Process Output Customer), project charter, problem statement, team formation, stakeholder analysis.",
        "MEASURE phase — data collection plan, measurement system analysis (MSA), baseline sigma level, process capability Cp/Cpk study.",
        "ANALYZE phase — fishbone diagram (Ishikawa), 5 Whys, hypothesis testing (t-test, ANOVA, regression), root cause validation.",
        "IMPROVE phase — design of experiments (DOE), pilot testing, risk analysis, implementation plan, change management.",
        "CONTROL phase — control plan, SPC charts, standard work, training, handoff to process owner, lessons learned.",
        "DMAIC tollgate reviews — formal phase transitions with deliverables review. Sponsor approval required.",
        "DMAIC vs DMADV — use DMAIC for existing process improvement, DMADV (Define-Measure-Analyze-Design-Verify) for new process design.",
    ],
    # BPMN 2.0 notation elements (pools, tasks, gateways, events, flows).
    "kb_bpmn_flows": [
        "BPMN 2.0 standard — Business Process Model and Notation. ISO 19510. Universal diagramming language for business processes.",
        "Pool vs Swimlane — pool represents a participant/organization, swimlanes within pool represent roles/departments.",
        "Tasks — atomic work units. Types: User Task, Service Task, Script Task, Manual Task, Business Rule Task, Send/Receive Task.",
        "Gateways — decision points. Types: Exclusive (XOR), Parallel (AND), Inclusive (OR), Event-Based, Complex.",
        "Events — triggers and outcomes. Start, Intermediate, End. Types: Message, Timer, Error, Signal, Compensation, Terminate.",
        "Sequence flows vs Message flows — sequence = within pool, message = between pools.",
        "Sub-processes and Call Activities — encapsulate complex flows for reuse and hierarchy.",
    ],
    # BPMN workflow patterns (sequence, splits, choices, loops, compensation).
    "kb_bpmn_patterns": [
        "Sequential pattern — tasks in strict order. Foundation building block.",
        "Parallel split (AND-split) — fork into concurrent flows. Merge with AND-join.",
        "Exclusive choice (XOR) — one path selected based on condition. Mutually exclusive branches.",
        "Inclusive choice (OR) — one or more paths based on conditions.",
        "Deferred choice — path selected by first event occurring (race).",
        "Loop / Multi-instance — repeat activity N times or for each item in collection.",
        "Compensation — undo completed work on failure. Transactional rollback pattern.",
    ],
    # WEVAL consulting strategies (methodology, pricing, archetypes, sales).
    "kb_consulting_strategy": [
        "WEVAL consulting core methodology — sovereign AI platform + Lean 6σ + ERP gap-fill. 25 ERPs × 60 pain points × 950 agents = 17.36M€ savings/client.",
        "WEVAL pricing — Discovery 5k€ + POC 15-25k€ + Rollout 80-300k€ + Managed 30-80k€/an. Modular engagement.",
        "Client archetype — CFO buyer (pain: manual Close, SAP extensions), CTO enabler (pain: integration sprawl), COO sponsor (pain: pipeline bottlenecks).",
        "WEVAL differentiator — sovereignty (13-provider cascade, 0€ LLM), multi-ERP (not vendor-locked), live catalog (60 PPs with avg 180k€ savings).",
        "Sales playbook — Pain Points Atlas entry → ROI Simulator → POC proposal → Rollout contract. Dogfood proof: WEVAL closes 35 gaps on itself = 2.4M€ savings.",
        "Ethica partnership — Kaouther Najar group. 156714 HCPs (DZ 112k + MA 19k + TN 17k + INTL). Campaign 109920 draft @10k/day. 0.8DH/contact counter [1.5/1.2/1.0DH].",
    ],
    # Value Stream Mapping best practices.
    "kb_vsm_best_practices": [
        "Value Stream Mapping — visualize entire material + information flow from supplier to customer. Start with current state map.",
        "Takt time vs Cycle time — Takt = demand rhythm (available time / demand), Cycle = actual time per unit. Balance: cycle ≤ takt.",
        "Flow efficiency = Value-Added Time / Total Lead Time. Typical: <10%. World-class: >25%.",
        "Pull vs Push — Kanban pull system replenishes based on consumption. Eliminates overproduction (#1 waste).",
        "7 wastes (TIMWOOD) — Transport, Inventory, Motion, Waiting, Overproduction, Over-processing, Defects. 8th added: Skills underutilization.",
        "Future state map — redesigned VSM with improvements. Aim for continuous flow + pull + leveled schedule.",
        "Spaghetti diagram — trace physical movement. Reveals excessive travel (Motion waste).",
    ],
    # WEVADS delivery knowledge.
    # NOTE(review): these entries embed infrastructure details (IP address,
    # zone id, account counts) — confirm they are meant to live in the KB.
    "kb_wevads_deliv": [
        "WEVADS architecture — PostgreSQL adx_system+adx_clients (6.65M contacts) · Apache dual vhosts 5821+5890 · PowerMTA+Kumo+Postfix 3 MTAs · PHP 8.5-FPM · N8N workflows · OVH tracking 151.80.235.110.",
        "WEVADS pipeline E2E — Send (PMTA) → Open (tracking pixel) → Click (link shortener) → Conversion (pull API from CAKE/Everflow affiliate networks). Conversion PULL not postback.",
        "WEVADS O365 — 604 accounts across 9+ tenants. Graph API creation. Users per tenant 500 cap. accoff04/06 primary. 97pct inbox rate via PMTA→O365 relay.",
        "WEVADS Ethica delivery — dns wevup.app · SPF+DKIM 2048bit+DMARC · Cloudflare zone 53e067fbc5c532a1 · PTR mail.weval-consulting.com · consent.wevup.app live 17 real optins.",
        "WEVADS seed network — 1275 seed accounts across 8 ISPs. Warmup 1783 accounts. Cap 77170 emails/day. Quality Guard enforces good_creatives threshold.",
        "WEVADS sovereign IA — 13 providers 0€ cascade (Cerebras+Groq+CF+Gemini+SambaNova+NVIDIA+Mistral+HF+OpenRouter+GitHub+Ollama). Fallback auto on 429/402/401.",
    ],
}
# Embed every KB document with the 768d model and upsert it into the
# collection named by its KB_CONTENT key.
for col_name, docs in KB_CONTENT.items():
    print(f"\n=== 3+. {col_name} ===")
    print(f" docs: {len(docs)}")
    if not docs:
        # Nothing to embed for this collection; skip the upsert round trip.
        print(" upserted: 0")
        continue
    embeddings = m768.encode(docs, show_progress_bar=False)
    # Payload "type" is the collection name without its kb_ marker.
    kb_type = col_name.replace('kb_', '')
    points = [
        PointStruct(
            # Deterministic ID derived from collection + document text, so
            # that re-running the seed overwrites the same points instead of
            # duplicating every document under a fresh random UUID.
            id=str(uuid.uuid5(uuid.NAMESPACE_URL, f"{col_name}/{text}")),
            vector=emb.tolist(),
            payload={
                "content": text, "type": kb_type,
                "source": "V96.10-seed", "seeded_at": "2026-04-19"
            },
        )
        for text, emb in zip(docs, embeddings)
    ]
    client.upsert(collection_name=col_name, points=points)
    print(f" upserted: {len(points)}")
# Closing banner, followed by a per-collection point count read back from
# Qdrant for every collection this script seeds.
rule = "=" * 60
print("\n" + rule)
print("🏆 SEED COMPLETE")
print(rule)
# Summary
seeded_collections = ['weval_intents_memory', 'weval_agents_registry', *KB_CONTENT]
for seeded_name in seeded_collections:
    point_total = client.get_collection(seeded_name).points_count
    print(f" {seeded_name:30} points={point_total}")